@telnyx/voice-agent-tester 0.4.4 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/README.md +185 -161
- package/applications/elevenlabs.yaml +1 -1
- package/javascript/audio_output_hooks.js +92 -2
- package/package.json +1 -1
- package/src/index.js +31 -12
- package/src/report.js +169 -90
- package/src/voice-agent-tester.js +8 -4
- package/tests/voice-agent-tester.test.js +133 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.5](https://github.com/team-telnyx/voice-agent-tester/compare/v0.4.4...v0.4.5) (2026-03-16)
|
|
4
|
+
|
|
5
|
+
### Bug Fixes
|
|
6
|
+
|
|
7
|
+
* add event-based fallback for audio monitoring (ElevenLabs support) ([#27](https://github.com/team-telnyx/voice-agent-tester/issues/27)) ([6051b5e](https://github.com/team-telnyx/voice-agent-tester/commit/6051b5e949376951f0fb046cffcc5a2a5c250e19))
|
|
8
|
+
* align comparison metrics by scenario step index, not absolute step number ([#23](https://github.com/team-telnyx/voice-agent-tester/issues/23)) ([e4c485b](https://github.com/team-telnyx/voice-agent-tester/commit/e4c485b6eae5e9a6d60f11745b46997a183fc180)), closes [#1](https://github.com/team-telnyx/voice-agent-tester/issues/1) [#2](https://github.com/team-telnyx/voice-agent-tester/issues/2)
|
|
9
|
+
* make ElevenLabs branch-id optional for comparison mode ([#24](https://github.com/team-telnyx/voice-agent-tester/issues/24)) ([3f1735a](https://github.com/team-telnyx/voice-agent-tester/commit/3f1735a6a02e6c1edc4b6e17a6be4087127bded8))
|
|
10
|
+
* single headline number in comparison, per-response in --debug ([#26](https://github.com/team-telnyx/voice-agent-tester/issues/26)) ([a482129](https://github.com/team-telnyx/voice-agent-tester/commit/a482129c1bfe49d28aca7dec8230d30e5b6d8f8a)), closes [#1](https://github.com/team-telnyx/voice-agent-tester/issues/1) [#2](https://github.com/team-telnyx/voice-agent-tester/issues/2)
|
|
11
|
+
|
|
12
|
+
### Documentation
|
|
13
|
+
|
|
14
|
+
* restructure README with comparison mode front and center ([#25](https://github.com/team-telnyx/voice-agent-tester/issues/25)) ([f15cbcd](https://github.com/team-telnyx/voice-agent-tester/commit/f15cbcd8707cded8081d00b90accf09fd77be169))
|
|
15
|
+
|
|
3
16
|
## [0.4.4](https://github.com/team-telnyx/voice-agent-tester/compare/v0.4.3...v0.4.4) (2026-03-11)
|
|
4
17
|
|
|
5
18
|
### Features
|
package/README.md
CHANGED
|
@@ -3,160 +3,119 @@
|
|
|
3
3
|
[CI status](https://github.com/team-telnyx/voice-agent-tester/actions/workflows/ci.yml)
|
|
4
4
|
[npm package](https://www.npmjs.com/package/@telnyx/voice-agent-tester)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
Automated benchmarking CLI for voice AI agents. Import your assistant from any provider, run identical test scenarios on both platforms, and get a side-by-side latency comparison.
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
Supports **Telnyx**, **ElevenLabs**, **Vapi**, and **Retell**.
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
## Compare Your Voice Agent Against Telnyx
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
npx @telnyx/voice-agent-tester@latest -a applications/telnyx.yaml -s scenarios/appointment.yaml --assistant-id <YOUR_ASSISTANT_ID>
|
|
14
|
-
```
|
|
12
|
+
The tool imports your assistant from an external provider into Telnyx, then runs the **same scenario** on both platforms and produces a head-to-head latency report:
|
|
15
13
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
14
|
+
```
|
|
15
|
+
📈 Latency Comparison (elapsed_time):
|
|
16
|
+
--------------------------------------------------------------------------------
|
|
17
|
+
Metric vapi Telnyx Delta Winner
|
|
18
|
+
--------------------------------------------------------------------------------
|
|
19
|
+
Response #1 (wait_for_voice_elapsed_time) 2849ms 1552ms -1297ms (-45.5%) 🏆 Telnyx
|
|
20
|
+
Response #2 (wait_for_voice_elapsed_time) 3307ms 704ms -2603ms (-78.7%) 🏆 Telnyx
|
|
21
|
+
--------------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
📊 Overall Summary:
|
|
24
|
+
Compared 2 matched response latencies
|
|
25
|
+
vapi total latency: 6156ms
|
|
26
|
+
Telnyx total latency: 2256ms
|
|
27
|
+
Difference: -3900ms (-63.3%)
|
|
28
|
+
|
|
29
|
+
🏆 Result: Telnyx is faster overall
|
|
21
30
|
```
|
|
22
31
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
| Option | Default | Description |
|
|
26
|
-
|--------|---------|-------------|
|
|
27
|
-
| `-a, --applications` | required | Application config path(s) or folder |
|
|
28
|
-
| `-s, --scenarios` | required | Scenario config path(s) or folder |
|
|
29
|
-
| `--assistant-id` | | Telnyx or provider assistant ID |
|
|
30
|
-
| `--api-key` | | Telnyx API key for authentication |
|
|
31
|
-
| `--provider` | | Import from provider (`vapi`, `elevenlabs`, `retell`) |
|
|
32
|
-
| `--provider-api-key` | | External provider API key (required with `--provider`) |
|
|
33
|
-
| `--provider-import-id` | | Provider assistant ID to import (required with `--provider`) |
|
|
34
|
-
| `--share-key` | | Vapi share key for comparison mode (prompted if missing) |
|
|
35
|
-
| `--branch-id` | | ElevenLabs branch ID for comparison mode (prompted if missing) |
|
|
36
|
-
| `--compare` | `true` | Run both provider direct and Telnyx import benchmarks |
|
|
37
|
-
| `--no-compare` | | Disable comparison (run only Telnyx import) |
|
|
38
|
-
| `-d, --debug` | `false` | Enable detailed timeout diagnostics |
|
|
39
|
-
| `-v, --verbose` | `false` | Show browser console logs |
|
|
40
|
-
| `--headless` | `true` | Run browser in headless mode |
|
|
41
|
-
| `--repeat` | `1` | Number of repetitions per combination |
|
|
42
|
-
| `-c, --concurrency` | `1` | Number of parallel tests |
|
|
43
|
-
| `-r, --report` | | Generate CSV report to specified file |
|
|
44
|
-
| `-p, --params` | | URL template params (e.g., `key=value,key2=value2`) |
|
|
45
|
-
| `--application-tags` | | Filter applications by comma-separated tags |
|
|
46
|
-
| `--scenario-tags` | | Filter scenarios by comma-separated tags |
|
|
47
|
-
| `--assets-server` | `http://localhost:3333` | Assets server URL |
|
|
48
|
-
| `--audio-url` | | URL to audio file to play as input during entire benchmark |
|
|
49
|
-
| `--audio-volume` | `1.0` | Volume level for audio input (0.0 to 1.0) |
|
|
50
|
-
|
|
51
|
-
## Bundled Configs
|
|
52
|
-
|
|
53
|
-
| Application Config | Provider |
|
|
54
|
-
|-------------------|----------|
|
|
55
|
-
| `applications/telnyx.yaml` | Telnyx AI Widget |
|
|
56
|
-
| `applications/elevenlabs.yaml` | ElevenLabs |
|
|
57
|
-
| `applications/vapi.yaml` | Vapi |
|
|
58
|
-
| `applications/retell.yaml` | Retell |
|
|
59
|
-
| `applications/livetok.yaml` | Livetok |
|
|
60
|
-
|
|
61
|
-
Scenarios:
|
|
62
|
-
- `scenarios/appointment.yaml` - Basic appointment booking test
|
|
63
|
-
- `scenarios/appointment_with_noise.yaml` - Appointment with background noise (pre-mixed audio)
|
|
64
|
-
|
|
65
|
-
## Background Noise Testing
|
|
66
|
-
|
|
67
|
-
Test voice agents' performance with ambient noise (e.g., crowd chatter, cafe environment). Background noise is pre-mixed into audio files to simulate real-world conditions where users speak to voice agents in noisy environments.
|
|
68
|
-
|
|
69
|
-
### Running with Background Noise
|
|
32
|
+
### Vapi vs Telnyx
|
|
70
33
|
|
|
71
34
|
```bash
|
|
72
|
-
# Telnyx with background noise
|
|
73
|
-
npx @telnyx/voice-agent-tester@latest \
|
|
74
|
-
-a applications/telnyx.yaml \
|
|
75
|
-
-s scenarios/appointment_with_noise.yaml \
|
|
76
|
-
--assistant-id <YOUR_ASSISTANT_ID>
|
|
77
|
-
|
|
78
|
-
# Compare with no noise (same assistant)
|
|
79
35
|
npx @telnyx/voice-agent-tester@latest \
|
|
80
36
|
-a applications/telnyx.yaml \
|
|
81
37
|
-s scenarios/appointment.yaml \
|
|
82
|
-
--
|
|
38
|
+
--provider vapi \
|
|
39
|
+
--share-key <VAPI_SHARE_KEY> \
|
|
40
|
+
--api-key <TELNYX_API_KEY> \
|
|
41
|
+
--provider-api-key <VAPI_API_KEY> \
|
|
42
|
+
--provider-import-id <VAPI_ASSISTANT_ID>
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### ElevenLabs vs Telnyx
|
|
83
46
|
|
|
84
|
-
|
|
47
|
+
```bash
|
|
85
48
|
npx @telnyx/voice-agent-tester@latest \
|
|
86
49
|
-a applications/telnyx.yaml \
|
|
87
|
-
-s scenarios/
|
|
88
|
-
--
|
|
89
|
-
-
|
|
50
|
+
-s scenarios/appointment.yaml \
|
|
51
|
+
--provider elevenlabs \
|
|
52
|
+
--api-key <TELNYX_API_KEY> \
|
|
53
|
+
--provider-api-key <ELEVENLABS_API_KEY> \
|
|
54
|
+
--provider-import-id <ELEVENLABS_AGENT_ID>
|
|
90
55
|
```
|
|
91
56
|
|
|
92
|
-
###
|
|
93
|
-
|
|
94
|
-
Play any audio file from a URL as input throughout the entire benchmark run. The audio is sent to the voice agent as microphone input.
|
|
57
|
+
### Retell vs Telnyx
|
|
95
58
|
|
|
96
59
|
```bash
|
|
97
|
-
# Use custom audio input from URL
|
|
98
60
|
npx @telnyx/voice-agent-tester@latest \
|
|
99
61
|
-a applications/telnyx.yaml \
|
|
100
62
|
-s scenarios/appointment.yaml \
|
|
101
|
-
--
|
|
102
|
-
--
|
|
103
|
-
--
|
|
63
|
+
--provider retell \
|
|
64
|
+
--api-key <TELNYX_API_KEY> \
|
|
65
|
+
--provider-api-key <RETELL_API_KEY> \
|
|
66
|
+
--provider-import-id <RETELL_AGENT_ID>
|
|
104
67
|
```
|
|
105
68
|
|
|
106
|
-
|
|
107
|
-
- Testing with custom audio inputs
|
|
108
|
-
- Using longer audio tracks that play throughout the benchmark
|
|
109
|
-
- A/B testing different audio sources
|
|
69
|
+
### How Comparison Works
|
|
110
70
|
|
|
111
|
-
|
|
71
|
+
1. **Import** — The assistant is imported from the external provider into Telnyx
|
|
72
|
+
2. **Phase 1: Provider Direct** — Runs the scenario on the provider's native widget
|
|
73
|
+
3. **Phase 2: Telnyx Import** — Runs the same scenario on the Telnyx-imported assistant
|
|
74
|
+
4. **Report** — Produces a side-by-side comparison with latency delta and winner per response
|
|
112
75
|
|
|
113
|
-
|
|
114
|
-
|------|-------------|
|
|
115
|
-
| `hello_make_an_appointment.mp3` | Clean appointment request |
|
|
116
|
-
| `hello_make_an_appointment_with_noise.mp3` | Appointment request with crowd noise |
|
|
117
|
-
| `appointment_data.mp3` | Clean appointment details |
|
|
118
|
-
| `appointment_data_with_noise.mp3` | Appointment details with crowd noise |
|
|
76
|
+
### Provider-Specific Keys
|
|
119
77
|
|
|
120
|
-
|
|
78
|
+
Some providers need an extra key to load their demo widget. If not passed via CLI, the tool prompts with instructions.
|
|
121
79
|
|
|
122
|
-
|
|
80
|
+
| Provider | Flag | Required? | How to find it |
|
|
81
|
+
|----------|------|-----------|----------------|
|
|
82
|
+
| Vapi | `--share-key` | Yes | Dashboard → select assistant → click 🔗 link icon next to the assistant ID |
|
|
83
|
+
| ElevenLabs | `--branch-id` | No | Dashboard → Agents → select agent → Publish dropdown → "Copy shareable link" |
|
|
123
84
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
-
|
|
131
|
-
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
-
|
|
135
|
-
|
|
136
|
-
-
|
|
137
|
-
metrics: elapsed_time
|
|
138
|
-
- action: wait_for_silence
|
|
139
|
-
- action: speak
|
|
140
|
-
file: appointment_data_with_noise.mp3
|
|
141
|
-
- action: wait_for_voice
|
|
142
|
-
metrics: elapsed_time
|
|
85
|
+
### Import Only (Skip Comparison)
|
|
86
|
+
|
|
87
|
+
To import without running the provider benchmark:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
npx @telnyx/voice-agent-tester@latest \
|
|
91
|
+
-a applications/telnyx.yaml \
|
|
92
|
+
-s scenarios/appointment.yaml \
|
|
93
|
+
--provider vapi \
|
|
94
|
+
--no-compare \
|
|
95
|
+
--api-key <TELNYX_API_KEY> \
|
|
96
|
+
--provider-api-key <VAPI_API_KEY> \
|
|
97
|
+
--provider-import-id <VAPI_ASSISTANT_ID>
|
|
143
98
|
```
|
|
144
99
|
|
|
145
|
-
|
|
100
|
+
## Quick Start
|
|
146
101
|
|
|
147
|
-
|
|
102
|
+
Run directly with npx (no installation required):
|
|
148
103
|
|
|
149
|
-
```
|
|
150
|
-
|
|
151
|
-
telnyx
|
|
104
|
+
```bash
|
|
105
|
+
npx @telnyx/voice-agent-tester@latest \
|
|
106
|
+
-a applications/telnyx.yaml \
|
|
107
|
+
-s scenarios/appointment.yaml \
|
|
108
|
+
--assistant-id <YOUR_ASSISTANT_ID>
|
|
152
109
|
```
|
|
153
110
|
|
|
154
|
-
|
|
155
|
-
- Response latency
|
|
156
|
-
- Speech recognition accuracy
|
|
157
|
-
- Overall conversation flow
|
|
111
|
+
Or install globally:
|
|
158
112
|
|
|
159
|
-
|
|
113
|
+
```bash
|
|
114
|
+
npm install -g @telnyx/voice-agent-tester
|
|
115
|
+
voice-agent-tester -a applications/telnyx.yaml -s scenarios/appointment.yaml --assistant-id <YOUR_ASSISTANT_ID>
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Provider Examples
|
|
160
119
|
|
|
161
120
|
### Telnyx
|
|
162
121
|
|
|
@@ -185,78 +144,143 @@ npx @telnyx/voice-agent-tester@latest \
|
|
|
185
144
|
--assistant-id <ASSISTANT_ID>
|
|
186
145
|
```
|
|
187
146
|
|
|
188
|
-
##
|
|
147
|
+
## CLI Reference
|
|
189
148
|
|
|
190
|
-
|
|
149
|
+
| Option | Default | Description |
|
|
150
|
+
|--------|---------|-------------|
|
|
151
|
+
| `-a, --applications` | required | Application config path(s) or folder |
|
|
152
|
+
| `-s, --scenarios` | required | Scenario config path(s) or folder |
|
|
153
|
+
| `--assistant-id` | | Telnyx or provider assistant ID |
|
|
154
|
+
| `--api-key` | | Telnyx API key |
|
|
155
|
+
| `--provider` | | Import from provider (`vapi`, `elevenlabs`, `retell`) |
|
|
156
|
+
| `--provider-api-key` | | External provider API key |
|
|
157
|
+
| `--provider-import-id` | | Provider assistant/agent ID to import |
|
|
158
|
+
| `--share-key` | | Vapi share key for comparison mode |
|
|
159
|
+
| `--branch-id` | | ElevenLabs branch ID (optional) |
|
|
160
|
+
| `--compare` | `true` | Run provider direct + Telnyx import benchmarks |
|
|
161
|
+
| `--no-compare` | | Skip provider direct benchmark |
|
|
162
|
+
| `-d, --debug` | `false` | Detailed timeout diagnostics |
|
|
163
|
+
| `-v, --verbose` | `false` | Show browser console logs |
|
|
164
|
+
| `--headless` | `true` | Run browser in headless mode |
|
|
165
|
+
| `--repeat` | `1` | Repetitions per app+scenario combination |
|
|
166
|
+
| `-c, --concurrency` | `1` | Parallel test runs |
|
|
167
|
+
| `-r, --report` | | CSV report output path |
|
|
168
|
+
| `-p, --params` | | URL template params (`key=value,key2=value2`) |
|
|
169
|
+
| `--retries` | `0` | Retry failed runs |
|
|
170
|
+
| `--application-tags` | | Filter applications by tags |
|
|
171
|
+
| `--scenario-tags` | | Filter scenarios by tags |
|
|
172
|
+
| `--record` | `false` | Record video+audio (webm) |
|
|
173
|
+
| `--audio-url` | | URL to audio file played as input during run |
|
|
174
|
+
| `--audio-volume` | `1.0` | Audio input volume (0.0–1.0) |
|
|
175
|
+
| `--assets-server` | `http://localhost:3333` | Assets server URL |
|
|
191
176
|
|
|
192
|
-
|
|
193
|
-
2. **Telnyx Import** - Benchmarks the same assistant after importing to Telnyx
|
|
177
|
+
## Bundled Configs
|
|
194
178
|
|
|
195
|
-
|
|
179
|
+
**Applications:**
|
|
196
180
|
|
|
197
|
-
|
|
181
|
+
| Config | Provider |
|
|
182
|
+
|--------|----------|
|
|
183
|
+
| `applications/telnyx.yaml` | Telnyx AI Widget |
|
|
184
|
+
| `applications/elevenlabs.yaml` | ElevenLabs |
|
|
185
|
+
| `applications/vapi.yaml` | Vapi |
|
|
186
|
+
| `applications/retell.yaml` | Retell |
|
|
187
|
+
|
|
188
|
+
**Scenarios:**
|
|
198
189
|
|
|
199
|
-
|
|
|
200
|
-
|
|
201
|
-
|
|
|
202
|
-
|
|
|
190
|
+
| Config | Description |
|
|
191
|
+
|--------|-------------|
|
|
192
|
+
| `scenarios/appointment.yaml` | Appointment booking test |
|
|
193
|
+
| `scenarios/appointment_with_noise.yaml` | Appointment with background crowd noise |
|
|
203
194
|
|
|
204
|
-
|
|
195
|
+
## Background Noise Testing
|
|
205
196
|
|
|
206
|
-
|
|
197
|
+
Test how voice agents perform with ambient noise by using pre-mixed audio files:
|
|
207
198
|
|
|
208
199
|
```bash
|
|
200
|
+
# With background noise
|
|
201
|
+
npx @telnyx/voice-agent-tester@latest \
|
|
202
|
+
-a applications/telnyx.yaml \
|
|
203
|
+
-s scenarios/appointment_with_noise.yaml \
|
|
204
|
+
--assistant-id <ASSISTANT_ID>
|
|
205
|
+
|
|
206
|
+
# Without noise (same assistant, compare results)
|
|
209
207
|
npx @telnyx/voice-agent-tester@latest \
|
|
210
208
|
-a applications/telnyx.yaml \
|
|
211
209
|
-s scenarios/appointment.yaml \
|
|
212
|
-
--
|
|
213
|
-
--share-key <VAPI_SHARE_KEY> \
|
|
214
|
-
--api-key <TELNYX_KEY> \
|
|
215
|
-
--provider-api-key <VAPI_KEY> \
|
|
216
|
-
--provider-import-id <VAPI_ASSISTANT_ID>
|
|
210
|
+
--assistant-id <ASSISTANT_ID>
|
|
217
211
|
```
|
|
218
212
|
|
|
219
|
-
|
|
213
|
+
### Custom Audio Input
|
|
214
|
+
|
|
215
|
+
Play any audio file from a URL as microphone input throughout the benchmark:
|
|
220
216
|
|
|
221
217
|
```bash
|
|
222
218
|
npx @telnyx/voice-agent-tester@latest \
|
|
223
219
|
-a applications/telnyx.yaml \
|
|
224
220
|
-s scenarios/appointment.yaml \
|
|
225
|
-
--
|
|
226
|
-
--
|
|
227
|
-
--
|
|
228
|
-
--provider-api-key <ELEVENLABS_KEY> \
|
|
229
|
-
--provider-import-id <ELEVENLABS_AGENT_ID>
|
|
221
|
+
--assistant-id <ASSISTANT_ID> \
|
|
222
|
+
--audio-url "https://example.com/test-audio.mp3" \
|
|
223
|
+
--audio-volume 0.8
|
|
230
224
|
```
|
|
231
225
|
|
|
232
|
-
|
|
233
|
-
- Run Phase 1: Provider direct benchmark
|
|
234
|
-
- Run Phase 2: Telnyx import benchmark
|
|
235
|
-
- Generate a side-by-side latency comparison report
|
|
226
|
+
### Audio Assets
|
|
236
227
|
|
|
237
|
-
|
|
228
|
+
| File | Description |
|
|
229
|
+
|------|-------------|
|
|
230
|
+
| `hello_make_an_appointment.mp3` | Clean appointment request |
|
|
231
|
+
| `hello_make_an_appointment_with_noise.mp3` | Appointment request + crowd noise |
|
|
232
|
+
| `appointment_data.mp3` | Clean appointment details |
|
|
233
|
+
| `appointment_data_with_noise.mp3` | Appointment details + crowd noise |
|
|
238
234
|
|
|
239
|
-
|
|
235
|
+
## Scenario Configuration
|
|
240
236
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
237
|
+
Scenarios are YAML files with a sequence of steps. Steps with `metrics: elapsed_time` are included in the latency report.
|
|
238
|
+
|
|
239
|
+
```yaml
|
|
240
|
+
# scenarios/appointment.yaml
|
|
241
|
+
steps:
|
|
242
|
+
- action: wait_for_voice # Wait for agent greeting
|
|
243
|
+
- action: wait_for_silence # Wait for greeting to finish
|
|
244
|
+
- action: speak
|
|
245
|
+
file: hello_make_an_appointment.mp3
|
|
246
|
+
- action: wait_for_voice # ← Measured: time to first response
|
|
247
|
+
metrics: elapsed_time
|
|
248
|
+
- action: wait_for_silence
|
|
249
|
+
- action: speak
|
|
250
|
+
file: appointment_data.mp3
|
|
251
|
+
- action: wait_for_voice # ← Measured: time to second response
|
|
252
|
+
metrics: elapsed_time
|
|
250
253
|
```
|
|
251
254
|
|
|
252
|
-
###
|
|
255
|
+
### Available Actions
|
|
256
|
+
|
|
257
|
+
| Action | Description |
|
|
258
|
+
|--------|-------------|
|
|
259
|
+
| `speak` | Play audio (`file`) or synthesize text (`text`) as microphone input |
|
|
260
|
+
| `wait_for_voice` | Wait for the AI agent to start speaking |
|
|
261
|
+
| `wait_for_silence` | Wait for the AI agent to stop speaking |
|
|
262
|
+
| `sleep` | Pause for a fixed duration (`time` in ms) |
|
|
263
|
+
| `click` | Click an element (`selector`) |
|
|
264
|
+
| `click_with_retry` | Click with retries and connection verification |
|
|
265
|
+
| `wait_for_element` | Wait for a DOM element to appear |
|
|
266
|
+
| `type` | Type text into an input field |
|
|
267
|
+
| `fill` | Set an input field value directly |
|
|
268
|
+
| `select` | Select dropdown/checkbox/radio option |
|
|
269
|
+
| `screenshot` | Capture a screenshot |
|
|
270
|
+
| `listen` | Record agent audio, transcribe, and evaluate |
|
|
253
271
|
|
|
254
|
-
|
|
272
|
+
## Debugging
|
|
273
|
+
|
|
274
|
+
If benchmarks fail or time out, use `--debug` for detailed diagnostics including audio monitor state, WebRTC connection info, and RTP stats:
|
|
255
275
|
|
|
256
276
|
```bash
|
|
257
|
-
voice-agent-tester
|
|
277
|
+
npx @telnyx/voice-agent-tester@latest \
|
|
278
|
+
-a applications/telnyx.yaml \
|
|
279
|
+
-s scenarios/appointment.yaml \
|
|
280
|
+
--assistant-id <ASSISTANT_ID> \
|
|
281
|
+
--debug
|
|
258
282
|
```
|
|
259
283
|
|
|
260
284
|
## License
|
|
261
285
|
|
|
262
|
-
MIT
|
|
286
|
+
MIT
|
|
@@ -526,10 +526,98 @@ class AudioElementMonitor {
|
|
|
526
526
|
|
|
527
527
|
console.log(`Started monitoring programmatic audio element: ${elementId}`);
|
|
528
528
|
} catch (error) {
|
|
529
|
-
console.error(`Failed to monitor programmatic audio element ${elementId}:`, error);
|
|
529
|
+
console.error(`Failed to monitor programmatic audio element ${elementId} via analyser:`, error.message);
|
|
530
|
+
console.log(`Falling back to event-based monitoring for ${elementId}`);
|
|
531
|
+
this.monitorViaEvents(audioElement, elementId);
|
|
530
532
|
}
|
|
531
533
|
}
|
|
532
534
|
|
|
535
|
+
/**
|
|
536
|
+
* Fallback monitoring using audio element events (timeupdate/pause/ended).
|
|
537
|
+
* Used when AudioContext-based monitoring fails (e.g., when the audio element
|
|
538
|
+
* is already connected to another AudioContext via MediaStreamDestination).
|
|
539
|
+
*/
|
|
540
|
+
monitorViaEvents(audioElement, elementId) {
|
|
541
|
+
const monitorData = {
|
|
542
|
+
element: audioElement,
|
|
543
|
+
source: null,
|
|
544
|
+
analyser: null,
|
|
545
|
+
dataArray: null,
|
|
546
|
+
isPlaying: false,
|
|
547
|
+
lastAudioTime: 0,
|
|
548
|
+
silenceThreshold: 10,
|
|
549
|
+
checkInterval: null,
|
|
550
|
+
isProgrammatic: true,
|
|
551
|
+
eventBased: true
|
|
552
|
+
};
|
|
553
|
+
|
|
554
|
+
this.monitoredElements.set(elementId, monitorData);
|
|
555
|
+
|
|
556
|
+
// Use timeupdate to detect audio activity — fires ~4x/sec during playback
|
|
557
|
+
let lastTimeUpdate = 0;
|
|
558
|
+
let silenceTimeoutId = null;
|
|
559
|
+
const SILENCE_DELAY = 1500; // ms of no timeupdate before declaring silence
|
|
560
|
+
|
|
561
|
+
const resetSilenceTimer = () => {
|
|
562
|
+
if (silenceTimeoutId) clearTimeout(silenceTimeoutId);
|
|
563
|
+
silenceTimeoutId = setTimeout(() => {
|
|
564
|
+
if (monitorData.isPlaying) {
|
|
565
|
+
monitorData.isPlaying = false;
|
|
566
|
+
this.dispatchAudioEvent('audiostop', elementId, audioElement);
|
|
567
|
+
if (typeof window.__publishEvent === 'function') {
|
|
568
|
+
window.__publishEvent('audiostop', { elementId, timestamp: Date.now() });
|
|
569
|
+
}
|
|
570
|
+
console.log(`Audio stopped (event-based): ${elementId}`);
|
|
571
|
+
}
|
|
572
|
+
}, SILENCE_DELAY);
|
|
573
|
+
};
|
|
574
|
+
|
|
575
|
+
audioElement.addEventListener('timeupdate', () => {
|
|
576
|
+
const now = Date.now();
|
|
577
|
+
// timeupdate also fires when seeking; require currentTime > 0 and throttle to one event per 50 ms (does not verify that currentTime actually advanced)
|
|
578
|
+
if (audioElement.currentTime > 0 && now - lastTimeUpdate > 50) {
|
|
579
|
+
lastTimeUpdate = now;
|
|
580
|
+
monitorData.lastAudioTime = now;
|
|
581
|
+
|
|
582
|
+
if (!monitorData.isPlaying) {
|
|
583
|
+
monitorData.isPlaying = true;
|
|
584
|
+
this.dispatchAudioEvent('audiostart', elementId, audioElement);
|
|
585
|
+
if (typeof window.__publishEvent === 'function') {
|
|
586
|
+
window.__publishEvent('audiostart', { elementId, timestamp: Date.now() });
|
|
587
|
+
}
|
|
588
|
+
console.log(`Audio started (event-based): ${elementId}`);
|
|
589
|
+
}
|
|
590
|
+
resetSilenceTimer();
|
|
591
|
+
}
|
|
592
|
+
});
|
|
593
|
+
|
|
594
|
+
audioElement.addEventListener('pause', () => {
|
|
595
|
+
if (monitorData.isPlaying) {
|
|
596
|
+
monitorData.isPlaying = false;
|
|
597
|
+
if (silenceTimeoutId) clearTimeout(silenceTimeoutId);
|
|
598
|
+
this.dispatchAudioEvent('audiostop', elementId, audioElement);
|
|
599
|
+
if (typeof window.__publishEvent === 'function') {
|
|
600
|
+
window.__publishEvent('audiostop', { elementId, timestamp: Date.now() });
|
|
601
|
+
}
|
|
602
|
+
console.log(`Audio stopped (event-based, pause): ${elementId}`);
|
|
603
|
+
}
|
|
604
|
+
});
|
|
605
|
+
|
|
606
|
+
audioElement.addEventListener('ended', () => {
|
|
607
|
+
if (monitorData.isPlaying) {
|
|
608
|
+
monitorData.isPlaying = false;
|
|
609
|
+
if (silenceTimeoutId) clearTimeout(silenceTimeoutId);
|
|
610
|
+
this.dispatchAudioEvent('audiostop', elementId, audioElement);
|
|
611
|
+
if (typeof window.__publishEvent === 'function') {
|
|
612
|
+
window.__publishEvent('audiostop', { elementId, timestamp: Date.now() });
|
|
613
|
+
}
|
|
614
|
+
console.log(`Audio stopped (event-based, ended): ${elementId}`);
|
|
615
|
+
}
|
|
616
|
+
});
|
|
617
|
+
|
|
618
|
+
console.log(`Started event-based monitoring for programmatic audio element: ${elementId}`);
|
|
619
|
+
}
|
|
620
|
+
|
|
533
621
|
monitorAudioElement(audioElement, elementId) {
|
|
534
622
|
if (!this.audioContext) {
|
|
535
623
|
console.warn("AudioContext not available, cannot monitor audio");
|
|
@@ -564,7 +652,9 @@ class AudioElementMonitor {
|
|
|
564
652
|
|
|
565
653
|
console.log(`Started monitoring audio element: ${elementId}`);
|
|
566
654
|
} catch (error) {
|
|
567
|
-
console.error(`Failed to monitor audio element ${elementId}:`, error);
|
|
655
|
+
console.error(`Failed to monitor audio element ${elementId} via analyser:`, error.message);
|
|
656
|
+
console.log(`Falling back to event-based monitoring for ${elementId}`);
|
|
657
|
+
this.monitorViaEvents(audioElement, elementId);
|
|
568
658
|
}
|
|
569
659
|
}
|
|
570
660
|
|
package/package.json
CHANGED
package/src/index.js
CHANGED
|
@@ -109,14 +109,7 @@ function getCompareRequiredParams(argv) {
|
|
|
109
109
|
}
|
|
110
110
|
break;
|
|
111
111
|
case 'elevenlabs':
|
|
112
|
-
|
|
113
|
-
missing.push({
|
|
114
|
-
key: 'branchId',
|
|
115
|
-
flag: '--branch-id',
|
|
116
|
-
description: 'ElevenLabs branch ID',
|
|
117
|
-
hint: 'In the ElevenLabs Dashboard, go to Agents, select your target agent, then click the dropdown next to Publish and select "Copy shareable link". This copies the demo link containing your branch ID.'
|
|
118
|
-
});
|
|
119
|
-
}
|
|
112
|
+
// branchId is optional — the talk-to URL works with just agent_id
|
|
120
113
|
break;
|
|
121
114
|
// retell and others: no extra params needed yet
|
|
122
115
|
}
|
|
@@ -134,8 +127,22 @@ function getCompareTemplateParams(argv) {
|
|
|
134
127
|
switch (argv.provider) {
|
|
135
128
|
case 'vapi':
|
|
136
129
|
return { shareKey: argv.shareKey };
|
|
130
|
+
default:
|
|
131
|
+
return {};
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Get provider-specific extra query parameters to append to the comparison URL.
|
|
137
|
+
* Unlike template params, these are appended as-is (not substituted into {{...}} placeholders).
|
|
138
|
+
*
|
|
139
|
+
* @param {Object} argv - Parsed CLI arguments
|
|
140
|
+
* @returns {Object} Key-value pairs to append as query parameters
|
|
141
|
+
*/
|
|
142
|
+
function getCompareExtraQueryParams(argv) {
|
|
143
|
+
switch (argv.provider) {
|
|
137
144
|
case 'elevenlabs':
|
|
138
|
-
return {
|
|
145
|
+
return argv.branchId ? { branch_id: argv.branchId } : {};
|
|
139
146
|
default:
|
|
140
147
|
return {};
|
|
141
148
|
}
|
|
@@ -295,7 +302,7 @@ const argv = yargs(hideBin(process.argv))
|
|
|
295
302
|
})
|
|
296
303
|
.option('branch-id', {
|
|
297
304
|
type: 'string',
|
|
298
|
-
description: 'ElevenLabs branch ID for direct widget testing (
|
|
305
|
+
description: 'ElevenLabs branch ID for direct widget testing (optional, appended to demo URL when provided)'
|
|
299
306
|
})
|
|
300
307
|
.option('assistant-id', {
|
|
301
308
|
type: 'string',
|
|
@@ -710,7 +717,19 @@ async function main() {
|
|
|
710
717
|
throw new Error(`Provider application config not found: ${providerAppPath}\nPlease create applications/${argv.provider}.yaml for direct provider benchmarking.`);
|
|
711
718
|
}
|
|
712
719
|
|
|
713
|
-
const
|
|
720
|
+
const providerApp = loadApplicationConfig(providerAppPath, providerParams);
|
|
721
|
+
|
|
722
|
+
// Append optional extra query parameters (e.g. branch_id for ElevenLabs)
|
|
723
|
+
const extraQueryParams = getCompareExtraQueryParams(argv);
|
|
724
|
+
if (providerApp.url && Object.keys(extraQueryParams).length > 0) {
|
|
725
|
+
const url = new URL(providerApp.url);
|
|
726
|
+
for (const [key, value] of Object.entries(extraQueryParams)) {
|
|
727
|
+
url.searchParams.set(key, value);
|
|
728
|
+
}
|
|
729
|
+
providerApp.url = url.toString();
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
const providerApplications = [providerApp];
|
|
714
733
|
|
|
715
734
|
const providerResults = await runBenchmark({
|
|
716
735
|
applications: providerApplications,
|
|
@@ -762,7 +781,7 @@ async function main() {
|
|
|
762
781
|
telnyxReportGenerator.generateMetricsSummary();
|
|
763
782
|
|
|
764
783
|
// Generate comparison report
|
|
765
|
-
ReportGenerator.generateComparisonSummary(providerReportGenerator, telnyxReportGenerator, argv.provider);
|
|
784
|
+
ReportGenerator.generateComparisonSummary(providerReportGenerator, telnyxReportGenerator, argv.provider, { debug: argv.debug });
|
|
766
785
|
|
|
767
786
|
// Generate CSV reports if requested
|
|
768
787
|
if (argv.report) {
|
package/src/report.js
CHANGED
|
@@ -28,7 +28,7 @@ export class ReportGenerator {
|
|
|
28
28
|
});
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
-
recordStepMetric(appName, scenarioName, repetition, stepIndex, action, name, value) {
|
|
31
|
+
recordStepMetric(appName, scenarioName, repetition, stepIndex, action, name, value, scenarioStepIndex = null) {
|
|
32
32
|
const key = this._getRunKey(appName, scenarioName, repetition);
|
|
33
33
|
const run = this.runs.get(key);
|
|
34
34
|
|
|
@@ -52,6 +52,26 @@ export class ReportGenerator {
|
|
|
52
52
|
if (!this.stepColumns.get(stepIndex).has(name)) {
|
|
53
53
|
this.stepColumns.get(stepIndex).set(name, `step_${stepIndex + 1}_${action}_${name}`);
|
|
54
54
|
}
|
|
55
|
+
|
|
56
|
+
// Track scenario step index for cross-provider comparison alignment
|
|
57
|
+
if (scenarioStepIndex !== null) {
|
|
58
|
+
if (!this.scenarioStepMap) {
|
|
59
|
+
this.scenarioStepMap = new Map();
|
|
60
|
+
}
|
|
61
|
+
// Map absolute stepIndex -> scenarioStepIndex (1-based)
|
|
62
|
+
this.scenarioStepMap.set(stepIndex, scenarioStepIndex);
|
|
63
|
+
|
|
64
|
+
// Track scenario-based column names for comparison display
|
|
65
|
+
if (!this.scenarioStepColumns) {
|
|
66
|
+
this.scenarioStepColumns = new Map();
|
|
67
|
+
}
|
|
68
|
+
if (!this.scenarioStepColumns.has(scenarioStepIndex)) {
|
|
69
|
+
this.scenarioStepColumns.set(scenarioStepIndex, new Map());
|
|
70
|
+
}
|
|
71
|
+
if (!this.scenarioStepColumns.get(scenarioStepIndex).has(name)) {
|
|
72
|
+
this.scenarioStepColumns.get(scenarioStepIndex).set(name, `scenario_step_${scenarioStepIndex}_${action}_${name}`);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
55
75
|
}
|
|
56
76
|
|
|
57
77
|
endRun(appName, scenarioName, repetition, success = true) {
|
|
@@ -285,119 +305,178 @@ export class ReportGenerator {
|
|
|
285
305
|
return result;
|
|
286
306
|
}
|
|
287
307
|
|
|
308
|
+
/**
|
|
309
|
+
* Get aggregated metrics keyed by scenario step index for cross-provider comparison.
|
|
310
|
+
* Returns a Map of scenarioStepIndex -> { metricName -> { avg, min, max, p50, columnName } }
|
|
311
|
+
*/
|
|
312
|
+
getAggregatedMetricsByScenarioStep() {
|
|
313
|
+
const result = new Map();
|
|
314
|
+
|
|
315
|
+
// Lookup map populated by recordStepMetric: absolute stepIndex -> scenarioStepIndex (empty if no scenario indices were recorded)
|
|
316
|
+
const scenarioStepMap = this.scenarioStepMap || new Map();
|
|
317
|
+
|
|
318
|
+
// Collect values grouped by scenarioStepIndex
|
|
319
|
+
const grouped = new Map(); // scenarioStepIndex -> metricName -> values[]
|
|
320
|
+
|
|
321
|
+
this.allRunsData.forEach(run => {
|
|
322
|
+
run.stepMetrics.forEach((metrics, stepIndex) => {
|
|
323
|
+
const scenarioIdx = scenarioStepMap.get(stepIndex);
|
|
324
|
+
if (scenarioIdx == null) return; // skip steps without scenario mapping
|
|
325
|
+
|
|
326
|
+
if (!grouped.has(scenarioIdx)) {
|
|
327
|
+
grouped.set(scenarioIdx, new Map());
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
metrics.forEach((value, metricName) => {
|
|
331
|
+
if (!grouped.get(scenarioIdx).has(metricName)) {
|
|
332
|
+
grouped.get(scenarioIdx).set(metricName, []);
|
|
333
|
+
}
|
|
334
|
+
grouped.get(scenarioIdx).get(metricName).push(value);
|
|
335
|
+
});
|
|
336
|
+
});
|
|
337
|
+
});
|
|
338
|
+
|
|
339
|
+
grouped.forEach((metricMap, scenarioIdx) => {
|
|
340
|
+
const stepResult = new Map();
|
|
341
|
+
|
|
342
|
+
metricMap.forEach((values, metricName) => {
|
|
343
|
+
if (values.length > 0) {
|
|
344
|
+
const sum = values.reduce((a, b) => a + b, 0);
|
|
345
|
+
const avg = sum / values.length;
|
|
346
|
+
const min = Math.min(...values);
|
|
347
|
+
const max = Math.max(...values);
|
|
348
|
+
|
|
349
|
+
const sortedValues = [...values].sort((a, b) => a - b);
|
|
350
|
+
let p50;
|
|
351
|
+
if (sortedValues.length % 2 === 0) {
|
|
352
|
+
p50 = (sortedValues[sortedValues.length / 2 - 1] + sortedValues[sortedValues.length / 2]) / 2;
|
|
353
|
+
} else {
|
|
354
|
+
p50 = sortedValues[Math.floor(sortedValues.length / 2)];
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
const columnName = this.scenarioStepColumns?.get(scenarioIdx)?.get(metricName) ||
|
|
358
|
+
`scenario_step_${scenarioIdx}_${metricName}`;
|
|
359
|
+
|
|
360
|
+
stepResult.set(metricName, { avg, min, max, p50, columnName });
|
|
361
|
+
}
|
|
362
|
+
});
|
|
363
|
+
|
|
364
|
+
result.set(scenarioIdx, stepResult);
|
|
365
|
+
});
|
|
366
|
+
|
|
367
|
+
return result;
|
|
368
|
+
}
|
|
369
|
+
|
|
288
370
|
/**
|
|
289
371
|
* Generate a comparison summary between two providers.
|
|
372
|
+
* Aligns metrics by scenario step index so that identical scenario steps
|
|
373
|
+
* are compared regardless of different application setup steps.
|
|
290
374
|
* @param {ReportGenerator} providerReport - Report from the provider benchmark
|
|
291
375
|
* @param {ReportGenerator} telnyxReport - Report from the Telnyx benchmark
|
|
292
376
|
* @param {string} providerName - Name of the external provider
|
|
293
377
|
*/
|
|
294
|
-
static generateComparisonSummary(providerReport, telnyxReport, providerName) {
|
|
378
|
+
static generateComparisonSummary(providerReport, telnyxReport, providerName, { debug = false } = {}) {
|
|
295
379
|
console.log('\n' + '='.repeat(80));
|
|
296
|
-
console.log('📊 COMPARISON
|
|
380
|
+
console.log('📊 COMPARISON: ' + providerName.toUpperCase() + ' vs TELNYX');
|
|
297
381
|
console.log('='.repeat(80));
|
|
298
382
|
|
|
299
|
-
|
|
300
|
-
const
|
|
383
|
+
// Use scenario-step-aligned metrics for comparison
|
|
384
|
+
const providerMetrics = providerReport.getAggregatedMetricsByScenarioStep();
|
|
385
|
+
const telnyxMetrics = telnyxReport.getAggregatedMetricsByScenarioStep();
|
|
301
386
|
|
|
302
|
-
// Find
|
|
303
|
-
const
|
|
387
|
+
// Find matched scenario steps (present in both providers)
|
|
388
|
+
const allScenarioSteps = new Set([
|
|
304
389
|
...providerMetrics.keys(),
|
|
305
390
|
...telnyxMetrics.keys()
|
|
306
391
|
]);
|
|
307
|
-
const sortedIndices = Array.from(
|
|
392
|
+
const sortedIndices = Array.from(allScenarioSteps).sort((a, b) => a - b);
|
|
393
|
+
|
|
394
|
+
// Collect matched latencies
|
|
395
|
+
const providerLatencies = [];
|
|
396
|
+
const telnyxLatencies = [];
|
|
397
|
+
const perResponse = []; // for debug output
|
|
308
398
|
|
|
309
|
-
|
|
310
|
-
|
|
399
|
+
sortedIndices.forEach(scenarioStep => {
|
|
400
|
+
const providerElapsed = providerMetrics.get(scenarioStep)?.get('elapsed_time');
|
|
401
|
+
const telnyxElapsed = telnyxMetrics.get(scenarioStep)?.get('elapsed_time');
|
|
402
|
+
|
|
403
|
+
if (providerElapsed && telnyxElapsed) {
|
|
404
|
+
providerLatencies.push(providerElapsed.avg);
|
|
405
|
+
telnyxLatencies.push(telnyxElapsed.avg);
|
|
406
|
+
perResponse.push({
|
|
407
|
+
providerAvg: providerElapsed.avg,
|
|
408
|
+
telnyxAvg: telnyxElapsed.avg,
|
|
409
|
+
columnName: providerElapsed.columnName || telnyxElapsed.columnName
|
|
410
|
+
});
|
|
411
|
+
}
|
|
412
|
+
});
|
|
413
|
+
|
|
414
|
+
if (providerLatencies.length === 0) {
|
|
415
|
+
console.log('\n⚠️ No comparable metrics found between providers.');
|
|
416
|
+
console.log('='.repeat(80));
|
|
311
417
|
return;
|
|
312
418
|
}
|
|
313
419
|
|
|
314
|
-
//
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
const
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
if (providerElapsed && telnyxElapsed) {
|
|
344
|
-
const diff = telnyxElapsed.avg - providerElapsed.avg;
|
|
345
|
-
const pct = ((diff / providerElapsed.avg) * 100).toFixed(1);
|
|
346
|
-
delta = diff > 0 ? `+${Math.round(diff)}ms` : `${Math.round(diff)}ms`;
|
|
347
|
-
|
|
348
|
-
if (Math.abs(diff) < 50) {
|
|
349
|
-
winner = '≈ Tie';
|
|
350
|
-
} else if (diff < 0) {
|
|
351
|
-
winner = '🏆 Telnyx';
|
|
352
|
-
} else {
|
|
353
|
-
winner = `🏆 ${providerName}`;
|
|
354
|
-
}
|
|
355
|
-
delta += ` (${pct}%)`;
|
|
420
|
+
// Debug: show per-response breakdown
|
|
421
|
+
if (debug && perResponse.length > 0) {
|
|
422
|
+
console.log('\n📈 Per-response breakdown:');
|
|
423
|
+
console.log('-'.repeat(80));
|
|
424
|
+
console.log(
|
|
425
|
+
'Response'.padEnd(40) +
|
|
426
|
+
providerName.padEnd(12) +
|
|
427
|
+
'Telnyx'.padEnd(12) +
|
|
428
|
+
'Delta'.padEnd(16) +
|
|
429
|
+
'Winner'
|
|
430
|
+
);
|
|
431
|
+
console.log('-'.repeat(80));
|
|
432
|
+
|
|
433
|
+
perResponse.forEach((r, i) => {
|
|
434
|
+
const action = (r.columnName || '').replace(/^scenario_step_\d+_/, '');
|
|
435
|
+
const label = `#${i + 1} (${action})`;
|
|
436
|
+
const shortLabel = label.length > 38 ? label.substring(0, 35) + '...' : label;
|
|
437
|
+
|
|
438
|
+
const diff = r.telnyxAvg - r.providerAvg;
|
|
439
|
+
const pct = ((diff / r.providerAvg) * 100).toFixed(1);
|
|
440
|
+
const delta = `${diff > 0 ? '+' : ''}${Math.round(diff)}ms (${pct}%)`;
|
|
441
|
+
|
|
442
|
+
let winner;
|
|
443
|
+
if (Math.abs(diff) < 50) {
|
|
444
|
+
winner = '≈ Tie';
|
|
445
|
+
} else if (diff < 0) {
|
|
446
|
+
winner = '🏆 Telnyx';
|
|
447
|
+
} else {
|
|
448
|
+
winner = `🏆 ${providerName}`;
|
|
356
449
|
}
|
|
357
450
|
|
|
358
451
|
console.log(
|
|
359
|
-
|
|
360
|
-
`${providerAvg}ms`.padEnd(12) +
|
|
361
|
-
`${telnyxAvg}ms`.padEnd(12) +
|
|
362
|
-
delta.padEnd(
|
|
452
|
+
shortLabel.padEnd(40) +
|
|
453
|
+
`${Math.round(r.providerAvg)}ms`.padEnd(12) +
|
|
454
|
+
`${Math.round(r.telnyxAvg)}ms`.padEnd(12) +
|
|
455
|
+
delta.padEnd(16) +
|
|
363
456
|
winner
|
|
364
457
|
);
|
|
365
|
-
}
|
|
366
|
-
});
|
|
367
|
-
|
|
368
|
-
console.log('-'.repeat(80));
|
|
458
|
+
});
|
|
369
459
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
sortedIndices.forEach(stepIndex => {
|
|
373
|
-
const providerStep = providerMetrics.get(stepIndex);
|
|
374
|
-
const telnyxStep = telnyxMetrics.get(stepIndex);
|
|
375
|
-
const providerElapsed = providerStep?.get('elapsed_time');
|
|
376
|
-
const telnyxElapsed = telnyxStep?.get('elapsed_time');
|
|
377
|
-
|
|
378
|
-
if (providerElapsed && telnyxElapsed) {
|
|
379
|
-
providerTotal += providerElapsed.avg;
|
|
380
|
-
telnyxTotal += telnyxElapsed.avg;
|
|
381
|
-
comparableSteps++;
|
|
382
|
-
}
|
|
383
|
-
});
|
|
460
|
+
console.log('-'.repeat(80));
|
|
461
|
+
}
|
|
384
462
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
463
|
+
// One headline number: average response latency
|
|
464
|
+
const providerAvg = providerLatencies.reduce((a, b) => a + b, 0) / providerLatencies.length;
|
|
465
|
+
const telnyxAvg = telnyxLatencies.reduce((a, b) => a + b, 0) / telnyxLatencies.length;
|
|
466
|
+
const diff = telnyxAvg - providerAvg;
|
|
467
|
+
const pct = ((diff / providerAvg) * 100).toFixed(1);
|
|
468
|
+
|
|
469
|
+
console.log(`\n Average response latency (${providerLatencies.length} matched responses):\n`);
|
|
470
|
+
console.log(` ${providerName.padEnd(16)} ${Math.round(providerAvg)}ms`);
|
|
471
|
+
console.log(` ${'Telnyx'.padEnd(16)} ${Math.round(telnyxAvg)}ms`);
|
|
472
|
+
console.log(` ${'Difference'.padEnd(16)} ${diff > 0 ? '+' : ''}${Math.round(diff)}ms (${pct}%)`);
|
|
473
|
+
|
|
474
|
+
if (Math.abs(diff) < 50) {
|
|
475
|
+
console.log('\n 🤝 Result: Both providers perform similarly');
|
|
476
|
+
} else if (diff < 0) {
|
|
477
|
+
console.log(`\n 🏆 Telnyx is ${Math.abs(pct)}% faster`);
|
|
478
|
+
} else {
|
|
479
|
+
console.log(`\n 🏆 ${providerName} is ${Math.abs(pct)}% faster`);
|
|
401
480
|
}
|
|
402
481
|
|
|
403
482
|
console.log('\n' + '='.repeat(80));
|
|
@@ -490,7 +490,7 @@ export class VoiceAgentTester {
|
|
|
490
490
|
}
|
|
491
491
|
}
|
|
492
492
|
|
|
493
|
-
async executeStep(step, stepIndex, appName = '', scenarioName = '', repetition = 1) {
|
|
493
|
+
async executeStep(step, stepIndex, appName = '', scenarioName = '', repetition = 1, scenarioStepIndex = null) {
|
|
494
494
|
if (!this.page) {
|
|
495
495
|
throw new Error('Browser not launched. Call launch() first.');
|
|
496
496
|
}
|
|
@@ -553,13 +553,13 @@ export class VoiceAgentTester {
|
|
|
553
553
|
// Record metrics for report if enabled and step has metrics attribute
|
|
554
554
|
if (this.reportGenerator && step.metrics) {
|
|
555
555
|
if (step.metrics.includes('elapsed_time')) {
|
|
556
|
-
this.reportGenerator.recordStepMetric(appName, scenarioName, repetition, stepIndex, step.action, 'elapsed_time', elapsedTimeMs);
|
|
556
|
+
this.reportGenerator.recordStepMetric(appName, scenarioName, repetition, stepIndex, step.action, 'elapsed_time', elapsedTimeMs, scenarioStepIndex);
|
|
557
557
|
}
|
|
558
558
|
// Record any additional metrics returned by the handler
|
|
559
559
|
if (handlerResult && typeof handlerResult === 'object') {
|
|
560
560
|
for (const [metricName, metricValue] of Object.entries(handlerResult)) {
|
|
561
561
|
if (step.metrics.includes(metricName)) {
|
|
562
|
-
this.reportGenerator.recordStepMetric(appName, scenarioName, repetition, stepIndex, step.action, metricName, metricValue);
|
|
562
|
+
this.reportGenerator.recordStepMetric(appName, scenarioName, repetition, stepIndex, step.action, metricName, metricValue, scenarioStepIndex);
|
|
563
563
|
}
|
|
564
564
|
}
|
|
565
565
|
}
|
|
@@ -1266,10 +1266,14 @@ export class VoiceAgentTester {
|
|
|
1266
1266
|
}
|
|
1267
1267
|
|
|
1268
1268
|
// Execute all configured steps
|
|
1269
|
+
const appStepCount = appSteps.length;
|
|
1269
1270
|
for (let i = 0; i < steps.length; i++) {
|
|
1270
1271
|
const step = steps[i];
|
|
1271
1272
|
console.log(`Executing step ${i + 1}: ${JSON.stringify(step)}`);
|
|
1272
|
-
|
|
1273
|
+
// For scenario steps (after app steps), pass the 1-based scenario step index
|
|
1274
|
+
// so metrics can be aligned across providers with different app setup steps
|
|
1275
|
+
const scenarioStepIndex = i >= appStepCount ? (i - appStepCount + 1) : null;
|
|
1276
|
+
await this.executeStep(step, i, appName, scenarioName, repetition, scenarioStepIndex);
|
|
1273
1277
|
}
|
|
1274
1278
|
|
|
1275
1279
|
// Keep the browser open for a bit after all steps
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { describe, test, expect, beforeEach, afterEach } from '@jest/globals';
|
|
2
2
|
import { VoiceAgentTester } from '../src/voice-agent-tester.js';
|
|
3
|
+
import { ReportGenerator } from '../src/report.js';
|
|
3
4
|
import fs from 'fs';
|
|
4
5
|
import path from 'path';
|
|
5
6
|
|
|
@@ -187,4 +188,136 @@ describe('VoiceAgentTester', () => {
|
|
|
187
188
|
await expect(tester.executeStep({ action: 'speak' }, 0, 'scenario'))
|
|
188
189
|
.rejects.toThrow('No text or file specified for speak action');
|
|
189
190
|
});
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
describe('ReportGenerator - Comparison Step Alignment', () => {
  /**
   * Build a provider report and a Telnyx report that share the same scenario
   * but have a different number of application setup steps.
   *
   * Provider (Vapi): 5 app steps + 7 scenario steps = 12 total,
   *   metric steps at absolute indices 8 (scenario step 4) and 11 (scenario step 7).
   * Telnyx: 3 app steps + 7 scenario steps = 10 total,
   *   metric steps at absolute indices 6 (scenario step 4) and 9 (scenario step 7).
   */
  const buildReports = () => {
    const providerReport = new ReportGenerator('/tmp/test_provider.csv');
    const telnyxReport = new ReportGenerator('/tmp/test_telnyx.csv');

    providerReport.beginRun('vapi', 'appointment', 0);
    providerReport.recordStepMetric('vapi', 'appointment', 0, 8, 'wait_for_voice', 'elapsed_time', 2849, 4);
    providerReport.recordStepMetric('vapi', 'appointment', 0, 11, 'wait_for_voice', 'elapsed_time', 3307, 7);
    providerReport.endRun('vapi', 'appointment', 0);

    telnyxReport.beginRun('telnyx', 'appointment', 0);
    telnyxReport.recordStepMetric('telnyx', 'appointment', 0, 6, 'wait_for_voice', 'elapsed_time', 1552, 4);
    telnyxReport.recordStepMetric('telnyx', 'appointment', 0, 9, 'wait_for_voice', 'elapsed_time', 704, 7);
    telnyxReport.endRun('telnyx', 'appointment', 0);

    return { providerReport, telnyxReport };
  };

  /**
   * Run `fn` with console.log replaced by a collector and return everything it
   * logged, joined by newlines. console.log is restored even when `fn` throws,
   * so a failing test cannot silence the output of later tests.
   */
  const captureLogs = (fn) => {
    const logs = [];
    const originalLog = console.log;
    console.log = (msg) => logs.push(msg);
    try {
      fn();
    } finally {
      console.log = originalLog;
    }
    return logs.join('\n');
  };

  test('should align metrics by scenario step index across providers with different app steps', () => {
    const { providerReport, telnyxReport } = buildReports();

    // Get scenario-aligned metrics
    const providerMetrics = providerReport.getAggregatedMetricsByScenarioStep();
    const telnyxMetrics = telnyxReport.getAggregatedMetricsByScenarioStep();

    // Both should have metrics at scenario steps 4 and 7
    expect(providerMetrics.has(4)).toBe(true);
    expect(providerMetrics.has(7)).toBe(true);
    expect(telnyxMetrics.has(4)).toBe(true);
    expect(telnyxMetrics.has(7)).toBe(true);

    // Verify values are correct
    expect(providerMetrics.get(4).get('elapsed_time').avg).toBe(2849);
    expect(providerMetrics.get(7).get('elapsed_time').avg).toBe(3307);
    expect(telnyxMetrics.get(4).get('elapsed_time').avg).toBe(1552);
    expect(telnyxMetrics.get(7).get('elapsed_time').avg).toBe(704);

    // The comparison should now have 2 comparable steps (not 4 separate unmatched ones)
    const allScenarioSteps = new Set([
      ...providerMetrics.keys(),
      ...telnyxMetrics.keys()
    ]);
    expect(allScenarioSteps.size).toBe(2);
  });

  test('should generate comparison summary with single headline number', () => {
    const { providerReport, telnyxReport } = buildReports();

    const output = captureLogs(() => {
      ReportGenerator.generateComparisonSummary(providerReport, telnyxReport, 'vapi');
    });

    // Should show averaged headline numbers: vapi avg = (2849+3307)/2 = 3078, telnyx avg = (1552+704)/2 = 1128
    expect(output).toContain('3078ms');
    expect(output).toContain('1128ms');
    // Should show "2 matched responses"
    expect(output).toContain('2 matched responses');
    // Should declare Telnyx the winner
    expect(output).toContain('🏆 Telnyx');
    // Should NOT contain per-response breakdown without debug
    expect(output).not.toContain('Per-response breakdown');
    expect(output).not.toContain('#1');
    expect(output).not.toContain('#2');
  });

  test('should show per-response breakdown with debug flag', () => {
    const { providerReport, telnyxReport } = buildReports();

    const output = captureLogs(() => {
      ReportGenerator.generateComparisonSummary(providerReport, telnyxReport, 'vapi', { debug: true });
    });

    // Should contain per-response breakdown
    expect(output).toContain('Per-response breakdown');
    expect(output).toContain('#1');
    expect(output).toContain('#2');
    expect(output).toContain('2849ms');
    expect(output).toContain('1552ms');
    expect(output).toContain('3307ms');
    expect(output).toContain('704ms');
    // Should ALSO contain the headline average
    expect(output).toContain('3078ms');
    expect(output).toContain('1128ms');
  });

  test('getAggregatedMetricsByScenarioStep returns empty map when no scenario steps', () => {
    const report = new ReportGenerator('/tmp/test.csv');
    report.beginRun('test', 'scenario', 0);
    // Record without scenarioStepIndex (app step)
    report.recordStepMetric('test', 'scenario', 0, 0, 'click', 'elapsed_time', 100);
    report.endRun('test', 'scenario', 0);

    const metrics = report.getAggregatedMetricsByScenarioStep();
    expect(metrics.size).toBe(0);
  });
});
|