voxarena 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- voxarena-0.1.0/LICENSE +21 -0
- voxarena-0.1.0/PKG-INFO +243 -0
- voxarena-0.1.0/README.md +191 -0
- voxarena-0.1.0/pyproject.toml +54 -0
- voxarena-0.1.0/setup.cfg +4 -0
- voxarena-0.1.0/voxarena/agent.py +91 -0
- voxarena-0.1.0/voxarena/cli.py +312 -0
- voxarena-0.1.0/voxarena/config.py +58 -0
- voxarena-0.1.0/voxarena/database.py +328 -0
- voxarena-0.1.0/voxarena/evaluator.py +150 -0
- voxarena-0.1.0/voxarena/generate_audio.py +145 -0
- voxarena-0.1.0/voxarena/harness.py +385 -0
- voxarena-0.1.0/voxarena/main.py +357 -0
- voxarena-0.1.0/voxarena/manifest.py +111 -0
- voxarena-0.1.0/voxarena/providers/__init__.py +71 -0
- voxarena-0.1.0/voxarena/providers/base.py +214 -0
- voxarena-0.1.0/voxarena/providers/gemini.py +31 -0
- voxarena-0.1.0/voxarena/providers/openai.py +44 -0
- voxarena-0.1.0/voxarena/report_generator.py +126 -0
- voxarena-0.1.0/voxarena/runner.py +140 -0
- voxarena-0.1.0/voxarena/tools.py +172 -0
- voxarena-0.1.0/voxarena.egg-info/PKG-INFO +243 -0
- voxarena-0.1.0/voxarena.egg-info/SOURCES.txt +25 -0
- voxarena-0.1.0/voxarena.egg-info/dependency_links.txt +1 -0
- voxarena-0.1.0/voxarena.egg-info/entry_points.txt +2 -0
- voxarena-0.1.0/voxarena.egg-info/requires.txt +11 -0
- voxarena-0.1.0/voxarena.egg-info/top_level.txt +1 -0
voxarena-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Simkeyur
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
voxarena-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: voxarena
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An evaluation arena for realtime voice agents.
|
|
5
|
+
Author: VoxArena contributors
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Simkeyur
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/simkeyur/vox-arena
|
|
29
|
+
Project-URL: Issues, https://github.com/simkeyur/vox-arena/issues
|
|
30
|
+
Keywords: voice-agents,realtime-llm,evaluation,benchmarking,gemini-live,openai-realtime,pipecat
|
|
31
|
+
Classifier: Development Status :: 3 - Alpha
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Programming Language :: Python :: 3
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
37
|
+
Classifier: Topic :: Software Development :: Testing
|
|
38
|
+
Requires-Python: >=3.11
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
License-File: LICENSE
|
|
41
|
+
Requires-Dist: fastapi>=0.110.0
|
|
42
|
+
Requires-Dist: uvicorn>=0.28.0
|
|
43
|
+
Requires-Dist: pydantic>=2.6.0
|
|
44
|
+
Requires-Dist: pydantic-settings>=2.2.0
|
|
45
|
+
Requires-Dist: pipecat-ai[google,openai]>=0.5.0
|
|
46
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
47
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
48
|
+
Requires-Dist: loguru>=0.7.2
|
|
49
|
+
Provides-Extra: dev
|
|
50
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
51
|
+
Dynamic: license-file
|
|
52
|
+
|
|
53
|
+
<p align="center">
|
|
54
|
+
<picture>
|
|
55
|
+
<source media="(prefers-color-scheme: dark)" srcset="ui/src/assets/logo-dark.png" />
|
|
56
|
+
<img src="ui/src/assets/logo.png" alt="VoxArena" width="220" />
|
|
57
|
+
</picture>
|
|
58
|
+
</p>
|
|
59
|
+
|
|
60
|
+
<p align="center"><em>An evaluation arena for realtime voice agents.</em></p>
|
|
61
|
+
|
|
62
|
+
<p align="center">
|
|
63
|
+
|
|
64
|
+
[MIT License](LICENSE)
|
|
65
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
66
|
+
[Built with Pipecat](https://github.com/pipecat-ai/pipecat)
|
|
67
|
+
[Contributions welcome](#contributing)
|
|
68
|
+
|
|
69
|
+
</p>
|
|
70
|
+
|
|
71
|
+
VoxArena is a reproducible benchmarking harness for realtime voice agents. Run the same scripted conversation across Gemini Live, OpenAI Realtime, and other [Pipecat](https://github.com/pipecat-ai/pipecat)-supported providers — and compare them apples-to-apples on latency, tool-call accuracy, and hallucinations.
|
|
72
|
+
|
|
73
|
+
Drop it into your CI pipeline, your dev loop, or the bundled control panel.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## 🚀 CI & Pipeline Integration
|
|
78
|
+
|
|
79
|
+
VoxArena ships a `voxarena` CLI designed for headless use in your build pipeline. It returns a non-zero exit code when any metric violates a threshold you define (for example, tool accuracy below a minimum, or hallucinations or latency above a maximum), and emits JUnit XML for native CI reporting.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install voxarena
|
|
83
|
+
|
|
84
|
+
voxarena run \
|
|
85
|
+
--provider gemini \
|
|
86
|
+
--script ./script/utterances.yaml \
|
|
87
|
+
--min-tool-accuracy 0.9 \
|
|
88
|
+
--max-hallucinations 0 \
|
|
89
|
+
--max-avg-ttfa-ms 1500 \
|
|
90
|
+
--output result.json \
|
|
91
|
+
--junit voxarena.xml
|
|
92
|
+
# exit 0 if every threshold passes, 1 otherwise
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Compare two providers in one shot
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
voxarena compare \
|
|
99
|
+
--gemini-model gemini-3.1-flash-live-preview \
|
|
100
|
+
--openai-model gpt-realtime-2 \
|
|
101
|
+
--num-turns 5 \
|
|
102
|
+
--min-tool-accuracy 0.9 \
|
|
103
|
+
--output compare.json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### GitHub Actions
|
|
107
|
+
|
|
108
|
+
```yaml
|
|
109
|
+
- name: Voice agent regression check
|
|
110
|
+
env:
|
|
111
|
+
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
|
112
|
+
run: |
|
|
113
|
+
pip install voxarena
|
|
114
|
+
voxarena run --provider gemini \
|
|
115
|
+
--min-tool-accuracy 0.92 --max-hallucinations 0 \
|
|
116
|
+
--junit voxarena.xml --quiet
|
|
117
|
+
|
|
118
|
+
- uses: mikepenz/action-junit-report@v4
|
|
119
|
+
if: always()
|
|
120
|
+
with:
|
|
121
|
+
report_paths: voxarena.xml
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Subcommands
|
|
125
|
+
|
|
126
|
+
| Command | What it does |
|
|
127
|
+
| --- | --- |
|
|
128
|
+
| `voxarena run` | Single-provider scripted run; exits 0/1 against thresholds. |
|
|
129
|
+
| `voxarena compare` | Runs Gemini and OpenAI in parallel against the same script. |
|
|
130
|
+
| `voxarena report` | Generates a markdown comparison report from past runs. |
|
|
131
|
+
|
|
132
|
+
Run `voxarena <command> --help` for the full flag set.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Features
|
|
137
|
+
|
|
138
|
+
- 🎙️ **Provider-agnostic agent** — one Pipecat pipeline drives every provider; swap models without re-implementing your agent
|
|
139
|
+
- 🔁 **Scripted conversations** — multi-turn YAML scripts with pre-recorded WAV inputs and expected tool calls / response content
|
|
140
|
+
- 📊 **Automated scoring** — tool-call correctness, response matching, hallucination counts, time-to-first-audio, interruption-stop latency
|
|
141
|
+
- 🆚 **Side-by-side comparisons** — run multiple providers in parallel against the same script
|
|
142
|
+
- 🗄️ **Persistent run history** — JSON manifests on disk, indexed in SQLite
|
|
143
|
+
- 🖥️ **Web control panel** — React UI to launch runs, watch live status, browse results, and edit scripts
|
|
144
|
+
- 🧩 **Extensible** — add a new provider by implementing one adapter class
|
|
145
|
+
|
|
146
|
+
## Architecture
|
|
147
|
+
|
|
148
|
+
```mermaid
|
|
149
|
+
flowchart TD
|
|
150
|
+
A["Recorded WAVs<br/>script/audio/*.wav"] --> B["Injection Harness<br/>voxarena/harness.py"]
|
|
151
|
+
B --> C
|
|
152
|
+
|
|
153
|
+
subgraph C ["Pipecat Pipeline"]
|
|
154
|
+
direction LR
|
|
155
|
+
C1["Audio Injector"] --> C2["Provider Adapter"]
|
|
156
|
+
C2 --> C3["Audio Capture"]
|
|
157
|
+
C3 --> C4["Metrics Collector"]
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
C2 <--> D{{"Provider Backend"}}
|
|
161
|
+
D --> D1["Gemini Live"]
|
|
162
|
+
D --> D2["OpenAI Realtime"]
|
|
163
|
+
D --> D3["...future providers"]
|
|
164
|
+
|
|
165
|
+
C4 --> E["Run Manifest<br/>results/PROVIDER/RUN_ID/manifest.json"]
|
|
166
|
+
E --> F[("SQLite Index<br/>runs.db")]
|
|
167
|
+
|
|
168
|
+
F <--> G["voxarena CLI<br/>+ FastAPI Backend"]
|
|
169
|
+
G <--> H["React Control Panel<br/>ui/"]
|
|
170
|
+
|
|
171
|
+
style D1 fill:#4285F4,color:#fff,stroke:#333
|
|
172
|
+
style D2 fill:#10A37F,color:#fff,stroke:#333
|
|
173
|
+
style D3 fill:#999,color:#fff,stroke:#333
|
|
174
|
+
style F fill:#f5f5f5,stroke:#333
|
|
175
|
+
style H fill:#fff7da,stroke:#333
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Local Dev (with UI)
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
git clone https://github.com/simkeyur/vox-arena.git
|
|
182
|
+
cd vox-arena
|
|
183
|
+
cp .env.example .env # add GOOGLE_API_KEY / OPENAI_API_KEY
|
|
184
|
+
|
|
185
|
+
python3 -m venv .venv && source .venv/bin/activate
|
|
186
|
+
pip install -e .
|
|
187
|
+
|
|
188
|
+
uvicorn voxarena.main:app --reload --port 8000
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Then in another terminal:
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
cd ui && npm install && npm run dev
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Open the control panel at `http://localhost:5173`.
|
|
198
|
+
|
|
199
|
+
## Bring Your Own Agent
|
|
200
|
+
|
|
201
|
+
The demo ships with the "Saffron Leaf" restaurant agent so you can run end-to-end on day one. To evaluate your own:
|
|
202
|
+
|
|
203
|
+
1. Write your system prompt to a `system_prompt.txt` file and point the agent's `data_dir` at its directory (see `voxarena/agent.py`)
|
|
204
|
+
2. Define your tool schemas and implement (or stub) your tools in `voxarena/tools.py`
|
|
205
|
+
3. Re-record `script/audio/*.wav` and update `script/utterances.yaml` to reflect your real workload
|
|
206
|
+
4. Run the arena as normal — every provider gets scored against your scripts
|
|
207
|
+
|
|
208
|
+
## Scripted Conversations
|
|
209
|
+
|
|
210
|
+
Conversations live in [`script/utterances.yaml`](script/utterances.yaml). Each turn pairs an utterance id with an `expect` block describing the correct tool call and/or response content:
|
|
211
|
+
|
|
212
|
+
```yaml
|
|
213
|
+
- id: u04
|
|
214
|
+
text: "Are you open on Sundays?"
|
|
215
|
+
expect:
|
|
216
|
+
tool: get_hours
|
|
217
|
+
args:
|
|
218
|
+
day: sunday
|
|
219
|
+
response_contains:
|
|
220
|
+
- "closed"
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
The harness plays `script/audio/{id}.wav` into the pipeline and scores the agent's actual tool calls and transcript against `expect`.
|
|
224
|
+
|
|
225
|
+
## Configuration
|
|
226
|
+
|
|
227
|
+
| Variable | Description |
|
|
228
|
+
| --- | --- |
|
|
229
|
+
| `GOOGLE_API_KEY` / `OPENAI_API_KEY` | Provider credentials |
|
|
230
|
+
| `GEMINI_MODEL` / `OPENAI_MODEL` | Realtime model under test |
|
|
231
|
+
| `GEMINI_EVAL_MODEL` / `OPENAI_EVAL_MODEL` | Cheaper text models for grading |
|
|
232
|
+
| `PORT` | FastAPI server port |
|
|
233
|
+
| `BASE_DIR` | Override workdir (CLI: `--workdir`) |
|
|
234
|
+
|
|
235
|
+
## Contributing
|
|
236
|
+
|
|
237
|
+
To add a new provider: implement an adapter in `voxarena/providers/` following the pattern in `gemini.py` / `openai.py`, wire it into `voxarena/harness.py` and `voxarena/config.py`, and open a PR.
|
|
238
|
+
|
|
239
|
+
For bugs and feature requests, please open an issue.
|
|
240
|
+
|
|
241
|
+
## License
|
|
242
|
+
|
|
243
|
+
[MIT](LICENSE).
|
voxarena-0.1.0/README.md
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset="ui/src/assets/logo-dark.png" />
|
|
4
|
+
<img src="ui/src/assets/logo.png" alt="VoxArena" width="220" />
|
|
5
|
+
</picture>
|
|
6
|
+
</p>
|
|
7
|
+
|
|
8
|
+
<p align="center"><em>An evaluation arena for realtime voice agents.</em></p>
|
|
9
|
+
|
|
10
|
+
<p align="center">
|
|
11
|
+
|
|
12
|
+
[MIT License](LICENSE)
|
|
13
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
14
|
+
[Built with Pipecat](https://github.com/pipecat-ai/pipecat)
|
|
15
|
+
[Contributions welcome](#contributing)
|
|
16
|
+
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
VoxArena is a reproducible benchmarking harness for realtime voice agents. Run the same scripted conversation across Gemini Live, OpenAI Realtime, and other [Pipecat](https://github.com/pipecat-ai/pipecat)-supported providers — and compare them apples-to-apples on latency, tool-call accuracy, and hallucinations.
|
|
20
|
+
|
|
21
|
+
Drop it into your CI pipeline, your dev loop, or the bundled control panel.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## 🚀 CI & Pipeline Integration
|
|
26
|
+
|
|
27
|
+
VoxArena ships a `voxarena` CLI designed for headless use in your build pipeline. It returns a non-zero exit code when any metric violates a threshold you define (for example, tool accuracy below a minimum, or hallucinations or latency above a maximum), and emits JUnit XML for native CI reporting.
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install voxarena
|
|
31
|
+
|
|
32
|
+
voxarena run \
|
|
33
|
+
--provider gemini \
|
|
34
|
+
--script ./script/utterances.yaml \
|
|
35
|
+
--min-tool-accuracy 0.9 \
|
|
36
|
+
--max-hallucinations 0 \
|
|
37
|
+
--max-avg-ttfa-ms 1500 \
|
|
38
|
+
--output result.json \
|
|
39
|
+
--junit voxarena.xml
|
|
40
|
+
# exit 0 if every threshold passes, 1 otherwise
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Compare two providers in one shot
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
voxarena compare \
|
|
47
|
+
--gemini-model gemini-3.1-flash-live-preview \
|
|
48
|
+
--openai-model gpt-realtime-2 \
|
|
49
|
+
--num-turns 5 \
|
|
50
|
+
--min-tool-accuracy 0.9 \
|
|
51
|
+
--output compare.json
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### GitHub Actions
|
|
55
|
+
|
|
56
|
+
```yaml
|
|
57
|
+
- name: Voice agent regression check
|
|
58
|
+
env:
|
|
59
|
+
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
|
60
|
+
run: |
|
|
61
|
+
pip install voxarena
|
|
62
|
+
voxarena run --provider gemini \
|
|
63
|
+
--min-tool-accuracy 0.92 --max-hallucinations 0 \
|
|
64
|
+
--junit voxarena.xml --quiet
|
|
65
|
+
|
|
66
|
+
- uses: mikepenz/action-junit-report@v4
|
|
67
|
+
if: always()
|
|
68
|
+
with:
|
|
69
|
+
report_paths: voxarena.xml
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Subcommands
|
|
73
|
+
|
|
74
|
+
| Command | What it does |
|
|
75
|
+
| --- | --- |
|
|
76
|
+
| `voxarena run` | Single-provider scripted run; exits 0/1 against thresholds. |
|
|
77
|
+
| `voxarena compare` | Runs Gemini and OpenAI in parallel against the same script. |
|
|
78
|
+
| `voxarena report` | Generates a markdown comparison report from past runs. |
|
|
79
|
+
|
|
80
|
+
Run `voxarena <command> --help` for the full flag set.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Features
|
|
85
|
+
|
|
86
|
+
- 🎙️ **Provider-agnostic agent** — one Pipecat pipeline drives every provider; swap models without re-implementing your agent
|
|
87
|
+
- 🔁 **Scripted conversations** — multi-turn YAML scripts with pre-recorded WAV inputs and expected tool calls / response content
|
|
88
|
+
- 📊 **Automated scoring** — tool-call correctness, response matching, hallucination counts, time-to-first-audio, interruption-stop latency
|
|
89
|
+
- 🆚 **Side-by-side comparisons** — run multiple providers in parallel against the same script
|
|
90
|
+
- 🗄️ **Persistent run history** — JSON manifests on disk, indexed in SQLite
|
|
91
|
+
- 🖥️ **Web control panel** — React UI to launch runs, watch live status, browse results, and edit scripts
|
|
92
|
+
- 🧩 **Extensible** — add a new provider by implementing one adapter class
|
|
93
|
+
|
|
94
|
+
## Architecture
|
|
95
|
+
|
|
96
|
+
```mermaid
|
|
97
|
+
flowchart TD
|
|
98
|
+
A["Recorded WAVs<br/>script/audio/*.wav"] --> B["Injection Harness<br/>voxarena/harness.py"]
|
|
99
|
+
B --> C
|
|
100
|
+
|
|
101
|
+
subgraph C ["Pipecat Pipeline"]
|
|
102
|
+
direction LR
|
|
103
|
+
C1["Audio Injector"] --> C2["Provider Adapter"]
|
|
104
|
+
C2 --> C3["Audio Capture"]
|
|
105
|
+
C3 --> C4["Metrics Collector"]
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
C2 <--> D{{"Provider Backend"}}
|
|
109
|
+
D --> D1["Gemini Live"]
|
|
110
|
+
D --> D2["OpenAI Realtime"]
|
|
111
|
+
D --> D3["...future providers"]
|
|
112
|
+
|
|
113
|
+
C4 --> E["Run Manifest<br/>results/PROVIDER/RUN_ID/manifest.json"]
|
|
114
|
+
E --> F[("SQLite Index<br/>runs.db")]
|
|
115
|
+
|
|
116
|
+
F <--> G["voxarena CLI<br/>+ FastAPI Backend"]
|
|
117
|
+
G <--> H["React Control Panel<br/>ui/"]
|
|
118
|
+
|
|
119
|
+
style D1 fill:#4285F4,color:#fff,stroke:#333
|
|
120
|
+
style D2 fill:#10A37F,color:#fff,stroke:#333
|
|
121
|
+
style D3 fill:#999,color:#fff,stroke:#333
|
|
122
|
+
style F fill:#f5f5f5,stroke:#333
|
|
123
|
+
style H fill:#fff7da,stroke:#333
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Local Dev (with UI)
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
git clone https://github.com/simkeyur/vox-arena.git
|
|
130
|
+
cd vox-arena
|
|
131
|
+
cp .env.example .env # add GOOGLE_API_KEY / OPENAI_API_KEY
|
|
132
|
+
|
|
133
|
+
python3 -m venv .venv && source .venv/bin/activate
|
|
134
|
+
pip install -e .
|
|
135
|
+
|
|
136
|
+
uvicorn voxarena.main:app --reload --port 8000
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Then in another terminal:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
cd ui && npm install && npm run dev
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Open the control panel at `http://localhost:5173`.
|
|
146
|
+
|
|
147
|
+
## Bring Your Own Agent
|
|
148
|
+
|
|
149
|
+
The demo ships with the "Saffron Leaf" restaurant agent so you can run end-to-end on day one. To evaluate your own:
|
|
150
|
+
|
|
151
|
+
1. Write your system prompt to a `system_prompt.txt` file and point the agent's `data_dir` at its directory (see `voxarena/agent.py`)
|
|
152
|
+
2. Define your tool schemas and implement (or stub) your tools in `voxarena/tools.py`
|
|
153
|
+
3. Re-record `script/audio/*.wav` and update `script/utterances.yaml` to reflect your real workload
|
|
154
|
+
4. Run the arena as normal — every provider gets scored against your scripts
|
|
155
|
+
|
|
156
|
+
## Scripted Conversations
|
|
157
|
+
|
|
158
|
+
Conversations live in [`script/utterances.yaml`](script/utterances.yaml). Each turn pairs an utterance id with an `expect` block describing the correct tool call and/or response content:
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
- id: u04
|
|
162
|
+
text: "Are you open on Sundays?"
|
|
163
|
+
expect:
|
|
164
|
+
tool: get_hours
|
|
165
|
+
args:
|
|
166
|
+
day: sunday
|
|
167
|
+
response_contains:
|
|
168
|
+
- "closed"
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
The harness plays `script/audio/{id}.wav` into the pipeline and scores the agent's actual tool calls and transcript against `expect`.
|
|
172
|
+
|
|
173
|
+
## Configuration
|
|
174
|
+
|
|
175
|
+
| Variable | Description |
|
|
176
|
+
| --- | --- |
|
|
177
|
+
| `GOOGLE_API_KEY` / `OPENAI_API_KEY` | Provider credentials |
|
|
178
|
+
| `GEMINI_MODEL` / `OPENAI_MODEL` | Realtime model under test |
|
|
179
|
+
| `GEMINI_EVAL_MODEL` / `OPENAI_EVAL_MODEL` | Cheaper text models for grading |
|
|
180
|
+
| `PORT` | FastAPI server port |
|
|
181
|
+
| `BASE_DIR` | Override workdir (CLI: `--workdir`) |
|
|
182
|
+
|
|
183
|
+
## Contributing
|
|
184
|
+
|
|
185
|
+
To add a new provider: implement an adapter in `voxarena/providers/` following the pattern in `gemini.py` / `openai.py`, wire it into `voxarena/harness.py` and `voxarena/config.py`, and open a PR.
|
|
186
|
+
|
|
187
|
+
For bugs and feature requests, please open an issue.
|
|
188
|
+
|
|
189
|
+
## License
|
|
190
|
+
|
|
191
|
+
[MIT](LICENSE).
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "voxarena"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "An evaluation arena for realtime voice agents."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
authors = [{ name = "VoxArena contributors" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"voice-agents",
|
|
15
|
+
"realtime-llm",
|
|
16
|
+
"evaluation",
|
|
17
|
+
"benchmarking",
|
|
18
|
+
"gemini-live",
|
|
19
|
+
"openai-realtime",
|
|
20
|
+
"pipecat",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Developers",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Software Development :: Testing",
|
|
30
|
+
]
|
|
31
|
+
dependencies = [
|
|
32
|
+
"fastapi>=0.110.0",
|
|
33
|
+
"uvicorn>=0.28.0",
|
|
34
|
+
"pydantic>=2.6.0",
|
|
35
|
+
"pydantic-settings>=2.2.0",
|
|
36
|
+
"pipecat-ai[openai,google]>=0.5.0",
|
|
37
|
+
"python-dotenv>=1.0.1",
|
|
38
|
+
"pyyaml>=6.0.1",
|
|
39
|
+
"loguru>=0.7.2",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
dev = ["pytest>=8.0.0"]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
voxarena = "voxarena.cli:main"
|
|
47
|
+
|
|
48
|
+
[project.urls]
|
|
49
|
+
Homepage = "https://github.com/simkeyur/vox-arena"
|
|
50
|
+
Issues = "https://github.com/simkeyur/vox-arena/issues"
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.packages.find]
|
|
53
|
+
where = ["."]
|
|
54
|
+
include = ["voxarena*"]
|
voxarena-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Agent definition — system prompt + tool schemas.
|
|
2
|
+
|
|
3
|
+
The bundled defaults ship the "Saffron Leaf" restaurant assistant under
|
|
4
|
+
``voxarena/data/saffron_leaf/`` so VoxArena runs end-to-end out of the box.
|
|
5
|
+
|
|
6
|
+
To evaluate your own agent, point ``data_dir`` at a directory containing a
|
|
7
|
+
``system_prompt.txt`` and edit ``voxarena/tools.py`` to register your tools.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
from voxarena.tools import TOOL_SCHEMAS
|
|
17
|
+
|
|
18
|
+
# Default agent bundle: the "saffron_leaf" example directory under "data",
# resolved relative to this module's own location.
DEFAULT_DATA_DIR = os.path.join(os.path.dirname(__file__), "data", "saffron_leaf")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Agent:
    """A versioned, hashable bundle of (system prompt + tool schemas).

    The system prompt is read from ``<data_dir>/system_prompt.txt`` and the
    tool schemas come from ``voxarena.tools.TOOL_SCHEMAS``. SHA-256 digests of
    both are computed at construction time so they can be recorded alongside
    the version tags.

    Args:
        prompt_version: Manifest tag for the prompt revision.
        tool_schema_version: Manifest tag for the tool schema revision.
        data_dir: Directory containing ``system_prompt.txt``. Defaults to the
            bundled Saffron Leaf example agent.

    Raises:
        FileNotFoundError: If ``system_prompt.txt`` does not exist in
            ``data_dir``.
    """

    def __init__(
        self,
        prompt_version: str = "v1.0",
        tool_schema_version: str = "v1.0",
        data_dir: Optional[str] = None,
    ):
        self.prompt_version = prompt_version
        self.tool_schema_version = tool_schema_version
        self.data_dir = data_dir or DEFAULT_DATA_DIR

        self.prompt_path = os.path.join(self.data_dir, "system_prompt.txt")
        self.system_prompt = self._load_system_prompt()

        self.tool_schemas = TOOL_SCHEMAS

        self.prompt_hash = self._sha256(self.system_prompt)
        # sort_keys makes the serialized form (and therefore the hash)
        # independent of dict key ordering in the schema definitions.
        self.tool_schema_hash = self._sha256(json.dumps(self.tool_schemas, sort_keys=True))

    def _load_system_prompt(self) -> str:
        """Return the stripped contents of the system prompt file.

        Raises:
            FileNotFoundError: If ``self.prompt_path`` does not exist.
        """
        if not os.path.exists(self.prompt_path):
            raise FileNotFoundError(f"System prompt file not found at {self.prompt_path}")
        # Decode explicitly as UTF-8: the prompt is hashed as UTF-8 below, and
        # relying on the platform's locale encoding would make non-ASCII
        # prompts fail to load or hash differently from machine to machine.
        with open(self.prompt_path, "r", encoding="utf-8") as f:
            return f.read().strip()

    @staticmethod
    def _sha256(content: str) -> str:
        """Return the hex SHA-256 digest of ``content`` encoded as UTF-8."""
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    def get_agent_metadata(self) -> Dict[str, Any]:
        """Versioning and checksum info recorded in manifests."""
        return {
            "prompt_version": self.prompt_version,
            "prompt_hash": self.prompt_hash,
            "tool_schema_version": self.tool_schema_version,
            "tool_schema_hash": self.tool_schema_hash,
        }

    def get_openai_tools(self) -> List[Dict[str, Any]]:
        """Return the tool schemas as a flat list of ``type: function`` entries."""
        return [
            {
                "type": "function",
                "name": s["name"],
                "description": s["description"],
                "parameters": s["parameters"],
            }
            for s in self.tool_schemas
        ]

    def get_gemini_tools(self) -> List[Dict[str, Any]]:
        """Return one ``function_declarations`` wrapper per tool schema."""
        return [
            {
                "function_declarations": [
                    {
                        "name": s["name"],
                        "description": s["description"],
                        "parameters": s["parameters"],
                    }
                ]
            }
            for s in self.tool_schemas
        ]
|