vent-hq 0.13.0 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -3
- package/dist/index.mjs +16 -103
- package/dist/{package-V2MEPDXW.mjs → package-GODDS4TH.mjs} +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# vent-hq
|
|
2
2
|
|
|
3
|
-
**Agent CLI for voice AI development.** Lets coding agents (Claude Code, Cursor, Codex) place real calls against your voice agent and read back transcripts, latency, audio, tool calls, and 60+ computed metrics — so they can iterate on prompts, flows, and platform config based on what actually happened.
|
|
3
|
+
**Agent CLI for voice AI development.** Lets coding agents (Claude Code, Cursor, Codex, Windsurf) place real calls against your voice agent and read back transcripts, latency, audio, tool calls, and 60+ computed metrics — so they can iterate on prompts, flows, and platform config based on what actually happened.
|
|
4
4
|
|
|
5
5
|
Works with **Vapi, Retell, LiveKit, ElevenLabs, Bland, and custom WebSocket endpoints**.
|
|
6
6
|
|
|
@@ -24,7 +24,7 @@ npx vent-hq@latest init
|
|
|
24
24
|
|
|
25
25
|
`init` will:
|
|
26
26
|
- Authenticate via GitHub (if you have `gh` installed) or open a browser for device-code auth
|
|
27
|
-
- Install skill files for Claude Code (`.claude/skills/vent/SKILL.md`), Cursor (`.cursor/rules/vent.mdc`),
|
|
27
|
+
- Install skill files for Claude Code (`.claude/skills/vent/SKILL.md`), Cursor (`.cursor/rules/vent.mdc`), Codex (`AGENTS.md`), and Windsurf (`.windsurf/skills/vent/SKILL.md`)
|
|
28
28
|
- Scaffold a starter suite at `.vent/suite.json`
|
|
29
29
|
|
|
30
30
|
After `init`, your coding agent reads the skill file and takes over from there.
|
|
@@ -61,7 +61,6 @@ Swap `adapter` for `retell`, `livekit`, `elevenlabs`, `bland`, or `websocket` to
|
|
|
61
61
|
| `vent-hq init` | One-time setup: auth, skill files, starter suite |
|
|
62
62
|
| `vent-hq run -f <suite.json>` | Run a call (or all calls) from a suite, stream results |
|
|
63
63
|
| `vent-hq run -f <s> --call <name>` | Run a single named call |
|
|
64
|
-
| `vent-hq status <run-id>` | Check or stream the status of a previous run |
|
|
65
64
|
| `vent-hq stop <run-id>` | Cancel a queued or running call |
|
|
66
65
|
| `vent-hq agent start -f <s>` | Keep a relay session open for a local WebSocket agent |
|
|
67
66
|
| `vent-hq login` / `logout` | Manage credentials |
|
package/dist/index.mjs
CHANGED
|
@@ -4157,7 +4157,6 @@ var ConversationCallSpecSchema = external_exports.object({
|
|
|
4157
4157
|
max_turns: external_exports.number().int().min(1).max(50).default(6),
|
|
4158
4158
|
silence_threshold_ms: external_exports.number().int().min(200).max(1e4).optional(),
|
|
4159
4159
|
persona: CallerPersonaSchema,
|
|
4160
|
-
prosody: external_exports.boolean().optional(),
|
|
4161
4160
|
caller_audio: CallerAudioEffectsSchema.optional(),
|
|
4162
4161
|
/** ISO 639-1 language code for multilingual calls (e.g., "es", "fr", "de"). Caller speaks this language, STT transcribes it, judge evaluates in it. */
|
|
4163
4162
|
language: external_exports.string().min(2).max(5).optional(),
|
|
@@ -4224,7 +4223,7 @@ var LiveKitPlatformSchema = BasePlatformSchema.extend({
|
|
|
4224
4223
|
livekit_api_secret: external_exports.string().optional(),
|
|
4225
4224
|
livekit_url: external_exports.string().optional(),
|
|
4226
4225
|
livekit_agent_name: external_exports.string().optional()
|
|
4227
|
-
});
|
|
4226
|
+
}).strict();
|
|
4228
4227
|
var VapiPlatformSchema = BasePlatformSchema.extend({
|
|
4229
4228
|
provider: external_exports.literal("vapi"),
|
|
4230
4229
|
vapi_api_key: external_exports.string().optional(),
|
|
@@ -4298,25 +4297,6 @@ var LatencyMetricsSchema = external_exports.object({
|
|
|
4298
4297
|
mouth_to_ear_est_ms: external_exports.number().optional(),
|
|
4299
4298
|
drift_slope_ms_per_turn: external_exports.number().optional()
|
|
4300
4299
|
});
|
|
4301
|
-
var TurnEmotionProfileSchema = external_exports.object({
|
|
4302
|
-
turn_index: external_exports.number().int().min(0),
|
|
4303
|
-
emotions: external_exports.record(external_exports.string(), external_exports.number()),
|
|
4304
|
-
calmness: external_exports.number(),
|
|
4305
|
-
confidence: external_exports.number(),
|
|
4306
|
-
frustration: external_exports.number(),
|
|
4307
|
-
warmth: external_exports.number(),
|
|
4308
|
-
uncertainty: external_exports.number()
|
|
4309
|
-
});
|
|
4310
|
-
var ProsodyMetricsSchema = external_exports.object({
|
|
4311
|
-
per_turn: external_exports.array(TurnEmotionProfileSchema),
|
|
4312
|
-
mean_calmness: external_exports.number(),
|
|
4313
|
-
mean_confidence: external_exports.number(),
|
|
4314
|
-
peak_frustration: external_exports.number(),
|
|
4315
|
-
emotion_consistency: external_exports.number(),
|
|
4316
|
-
naturalness: external_exports.number(),
|
|
4317
|
-
emotion_trajectory: external_exports.enum(["stable", "improving", "degrading", "volatile"]),
|
|
4318
|
-
hume_latency_ms: external_exports.number()
|
|
4319
|
-
});
|
|
4320
4300
|
var HarnessOverheadSchema = external_exports.object({
|
|
4321
4301
|
tts_per_turn_ms: external_exports.array(external_exports.number()),
|
|
4322
4302
|
stt_per_turn_ms: external_exports.array(external_exports.number()),
|
|
@@ -4398,7 +4378,6 @@ var ConversationMetricsSchema = external_exports.object({
|
|
|
4398
4378
|
latency: LatencyMetricsSchema.optional(),
|
|
4399
4379
|
tool_calls: ToolCallMetricsSchema.optional(),
|
|
4400
4380
|
signal_quality: SignalQualityMetricsSchema.optional(),
|
|
4401
|
-
prosody: ProsodyMetricsSchema.optional(),
|
|
4402
4381
|
harness_overhead: HarnessOverheadSchema.optional(),
|
|
4403
4382
|
component_latency: ComponentLatencyMetricsSchema.optional()
|
|
4404
4383
|
});
|
|
@@ -4457,7 +4436,6 @@ function formatConversationResult(raw, options = {}) {
|
|
|
4457
4436
|
const warnings = dedupeStrings([
|
|
4458
4437
|
...formatProviderWarningMessages(r.call_metadata?.provider_warnings)
|
|
4459
4438
|
]);
|
|
4460
|
-
const emotion = r.metrics?.prosody ? formatEmotion(r.metrics.prosody) : null;
|
|
4461
4439
|
const result = {
|
|
4462
4440
|
name: r.name ?? null,
|
|
4463
4441
|
status: r.status,
|
|
@@ -4470,7 +4448,6 @@ function formatConversationResult(raw, options = {}) {
|
|
|
4470
4448
|
call_metadata: formatCallMetadata(r.call_metadata, verbose)
|
|
4471
4449
|
};
|
|
4472
4450
|
if (warnings.length > 0) result.warnings = warnings;
|
|
4473
|
-
if (emotion) result.emotion = emotion;
|
|
4474
4451
|
if (verbose) result.caller_prompt = r.caller_prompt;
|
|
4475
4452
|
if (debug2) result.debug = debug2;
|
|
4476
4453
|
return result;
|
|
@@ -4577,15 +4554,6 @@ function stripExecutionMessage(args) {
|
|
|
4577
4554
|
const { execution_message: _drop, ...rest } = args;
|
|
4578
4555
|
return rest;
|
|
4579
4556
|
}
|
|
4580
|
-
function formatEmotion(prosody) {
|
|
4581
|
-
return {
|
|
4582
|
-
naturalness: prosody.naturalness,
|
|
4583
|
-
mean_calmness: prosody.mean_calmness,
|
|
4584
|
-
mean_confidence: prosody.mean_confidence,
|
|
4585
|
-
peak_frustration: prosody.peak_frustration,
|
|
4586
|
-
emotion_trajectory: prosody.emotion_trajectory
|
|
4587
|
-
};
|
|
4588
|
-
}
|
|
4589
4557
|
function formatComponentLatency(cl, verbose) {
|
|
4590
4558
|
if (!cl) return null;
|
|
4591
4559
|
const result = {
|
|
@@ -4648,7 +4616,6 @@ function formatDebug(result) {
|
|
|
4648
4616
|
const debug2 = compactUnknownRecord({
|
|
4649
4617
|
signal_quality: result.metrics?.signal_quality,
|
|
4650
4618
|
harness_overhead: result.metrics?.harness_overhead,
|
|
4651
|
-
prosody: result.metrics?.prosody,
|
|
4652
4619
|
provider_warnings: nonEmptyArray(result.call_metadata?.provider_warnings),
|
|
4653
4620
|
component_latency_per_turn: nonEmptyArray(result.metrics?.component_latency?.per_turn),
|
|
4654
4621
|
observed_tool_calls: formatDebugToolCalls(result.observed_tool_calls),
|
|
@@ -4710,9 +4677,6 @@ function stdoutSync(data) {
|
|
|
4710
4677
|
}
|
|
4711
4678
|
}
|
|
4712
4679
|
}
|
|
4713
|
-
function writeJsonStdout(value) {
|
|
4714
|
-
stdoutSync(JSON.stringify(value, null, 2) + "\n");
|
|
4715
|
-
}
|
|
4716
4680
|
var bold = (s) => isTTY ? `\x1B[1m${s}\x1B[0m` : s;
|
|
4717
4681
|
var dim = (s) => isTTY ? `\x1B[2m${s}\x1B[0m` : s;
|
|
4718
4682
|
var green = (s) => isTTY ? `\x1B[32m${s}\x1B[0m` : s;
|
|
@@ -4819,7 +4783,7 @@ function printSummary(callResults, runComplete, runId, options = {}) {
|
|
|
4819
4783
|
stdoutSync(" " + parts.join(" ") + "\n");
|
|
4820
4784
|
}
|
|
4821
4785
|
}
|
|
4822
|
-
stdoutSync(dim(`
|
|
4786
|
+
stdoutSync(dim(`Run ID: ${runId}`) + "\n");
|
|
4823
4787
|
}
|
|
4824
4788
|
function buildRunSummaryJson(options) {
|
|
4825
4789
|
const calls = options.rawCalls ? formatRawCalls(options.rawCalls, options.verbose ?? false) : options.formattedCalls ?? [];
|
|
@@ -5788,45 +5752,6 @@ function findFreePort() {
|
|
|
5788
5752
|
});
|
|
5789
5753
|
}
|
|
5790
5754
|
|
|
5791
|
-
// src/commands/status.ts
|
|
5792
|
-
async function statusCommand(args) {
|
|
5793
|
-
const accessToken = await loadAccessToken();
|
|
5794
|
-
if (!accessToken) {
|
|
5795
|
-
printError("No Vent access token found. Run `npx vent-hq init` first.");
|
|
5796
|
-
return 2;
|
|
5797
|
-
}
|
|
5798
|
-
try {
|
|
5799
|
-
const res = await apiFetch(`/runs/${args.runId}`, accessToken);
|
|
5800
|
-
const data = await res.json();
|
|
5801
|
-
const aggregate = data.aggregate;
|
|
5802
|
-
const counts = aggregate?.conversation_calls;
|
|
5803
|
-
const results = Array.isArray(data.results) ? data.results : [];
|
|
5804
|
-
const summary = buildRunSummaryJson({
|
|
5805
|
-
runId: typeof data.id === "string" ? data.id : args.runId,
|
|
5806
|
-
status: data.status,
|
|
5807
|
-
total: counts?.total,
|
|
5808
|
-
passed: counts?.passed,
|
|
5809
|
-
failed: counts?.failed,
|
|
5810
|
-
rawCalls: results,
|
|
5811
|
-
verbose: args.verbose,
|
|
5812
|
-
runDetails: {
|
|
5813
|
-
created_at: data.created_at,
|
|
5814
|
-
started_at: data.started_at,
|
|
5815
|
-
finished_at: data.finished_at,
|
|
5816
|
-
duration_ms: data.duration_ms,
|
|
5817
|
-
error_text: data.error_text,
|
|
5818
|
-
aggregate: data.aggregate
|
|
5819
|
-
}
|
|
5820
|
-
});
|
|
5821
|
-
writeJsonStdout(summary);
|
|
5822
|
-
const status = data.status;
|
|
5823
|
-
return status === "pass" ? 0 : status === "fail" ? 1 : 0;
|
|
5824
|
-
} catch (err) {
|
|
5825
|
-
printError(err.message);
|
|
5826
|
-
return 2;
|
|
5827
|
-
}
|
|
5828
|
-
}
|
|
5829
|
-
|
|
5830
5755
|
// src/lib/browser.ts
|
|
5831
5756
|
import { exec } from "node:child_process";
|
|
5832
5757
|
function openBrowser(url) {
|
|
@@ -5936,13 +5861,16 @@ import * as fs5 from "node:fs/promises";
|
|
|
5936
5861
|
import * as path3 from "node:path";
|
|
5937
5862
|
|
|
5938
5863
|
// src/skills/claude-code.md
|
|
5939
|
-
var claude_code_default = '---\nname: vent\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nallowed-tools: Bash(npx vent-hq *)\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. 
domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. 
"the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Claude Code Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly; do not call `vent status` unless checking an older run.\n\nClaude Code serializes separate Bash tool calls for `npx vent-hq ...`, so run multiple calls from one suite by invoking each named call with `--call <name>` in one Bash command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\nFor long-running composite commands \u2014 e.g. LiveKit\'s required `kill workers \u2192 start fresh worker \u2192 wait 60s \u2192 npx vent-hq run` preflight \u2014 keep the entire sequence in **one foreground Bash call**. Do not use `Monitor` or `run_in_background: true` for the wait. Both end the turn until an event fires, which means Vent never sees an inflight call and the user reads it as "you backgrounded the test." The harness also blocks bare `sleep 60` (any long leading sleep), so fold the readiness check and the wait into a single `until`-loop \u2014 short sleeps inside a polling loop are allowed because each iteration is real work:\n\n```bash\nnohup npm run dev > /tmp/lk-agent.log 2>&1 &\nstart=$(date +%s); \\\nuntil grep -q "registered worker" /tmp/lk-agent.log 2>/dev/null \\\n && [ $(($(date +%s) - start)) -ge 60 ]; do sleep 2; done \\\n&& npx vent-hq run -f .vent/suite.livekit.json --call <name>\n```\n\nFor these composites, raise the Bash timeout to **10 minutes** (`600000` ms) \u2014 startup + 60s wait + call can easily exceed the 5-minute default.\n\n## Workflow\n\n1. Identify the behavior under test. 
Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally. (See `Claude Code Execution` for how to compose the kill\u2192start\u2192wait\u2192submit as a single foreground command.)\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. 
Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run or status; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq status <run-id> # Fetch results for a previous run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. 
`calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n- `prosody: true` enables emotion analysis and requires Hume access.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. 
For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. 
Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. 
Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit does not support multiple concurrent Vent calls against one agent process yet. 
Run LiveKit calls sequentially unless you intentionally start separate agent worker processes and route each call to its own process. For Node agents, that means separate Node.js processes. Do not treat parallel calls against a single LiveKit worker as a valid concurrency test until multi-call support is engineered.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. 
`"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. 
If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. 
The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5864
|
+
// Claude Code skill text, bundled as a single string constant: YAML front
// matter (`name`, `description`, `allowed-tools`) followed by the Markdown
// body of the Vent skill. Every line break in the document is a `\n` escape,
// so the literal must stay on one physical line.
// NOTE(review): presumably this is what `init` writes to
// `.claude/skills/vent/SKILL.md` — confirm at the call site.
// The "Claude Code Execution" section uses "default" only for the Bash tool's
// own 2-minute timeout; the 5-minute value is the timeout this skill asks for
// on ordinary runs, and composites raise it to 10 minutes.
var claude_code_default = '---\nname: vent\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nallowed-tools: Bash(npx vent-hq *)\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Claude Code Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly.\n\nClaude Code serializes separate Bash tool calls for `npx vent-hq ...`, so run multiple calls from one suite by invoking each named call with `--call <name>` in one Bash command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\nFor long-running composite commands \u2014 e.g. LiveKit\'s required `kill workers \u2192 start fresh worker \u2192 wait 60s \u2192 npx vent-hq run` preflight \u2014 keep the entire sequence in **one foreground Bash call**. Do not use `Monitor` or `run_in_background: true` for the wait. Both end the turn until an event fires, which means Vent never sees an inflight call and the user reads it as "you backgrounded the test." The harness also blocks bare `sleep 60` (any long leading sleep), so fold the readiness check and the wait into a single `until`-loop \u2014 short sleeps inside a polling loop are allowed because each iteration is real work:\n\n```bash\nnohup npm run dev > /tmp/lk-agent.log 2>&1 &\nstart=$(date +%s); \\\nuntil grep -q "registered worker" /tmp/lk-agent.log 2>/dev/null \\\n && [ $(($(date +%s) - start)) -ge 60 ]; do sleep 2; done \\\n&& npx vent-hq run -f .vent/suite.livekit.json --call <name>\n```\n\nFor these composites, raise the Bash timeout to **10 minutes** (`600000` ms) \u2014 startup + 60s wait + call can easily exceed the 5-minute timeout used for ordinary runs.\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally. (See `Claude Code Execution` for how to compose the kill\u2192start\u2192wait\u2192submit as a single foreground command.)\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5940
5865
|
|
|
5941
5866
|
// src/skills/cursor.md
|
|
5942
|
-
// Embedded text of the Cursor skill file (src/skills/cursor.md), stored as one string literal.
// The text opens with a front-matter block (`description`, `alwaysApply: true`) and is followed by
// the Markdown body titled "Vent - Voice Agent Calls" (when to test, iteration rules, commands,
// suite config, adapters, output shape, reporting rules).
// NOTE(review): presumably this is the content `init` installs as the Cursor rule file
// (`.cursor/rules/vent.mdc`) — confirm against the installer code, which is not visible here.
var cursor_default = '---\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nalwaysApply: true\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. 
domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. 
"the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Cursor Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly; do not call `vent status` unless checking an older run.\n\nCursor cannot run separate shell tool calls concurrently \u2014 for multiple calls from one suite, invoke each named call with `--call <name>` in one shell command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. 
Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run or status; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq status <run-id> # Fetch results for a previous run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. 
For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. 
Ignored if `caller_audio.accent` is set or `language` is non-English.\n- `prosody: true` enables emotion analysis and requires Hume access.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. 
Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. 
Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. 
Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit does not support multiple concurrent Vent calls against one agent process yet. Run LiveKit calls sequentially unless you intentionally start separate agent worker processes and route each call to its own process. For Node agents, that means separate Node.js processes. Do not treat parallel calls against a single LiveKit worker as a valid concurrency test until multi-call support is engineered.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. 
Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. 
Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. 
After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5867
|
+
var cursor_default = '---\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nalwaysApply: true\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. 
domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. 
"the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Cursor Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly.\n\nCursor cannot run separate shell tool calls concurrently \u2014 for multiple calls from one suite, invoke each named call with `--call <name>` in one shell command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. 
Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. 
For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. 
Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. 
Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. 
Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. 
LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. 
Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. 
Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. 
After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5943
5868
|
|
|
5944
5869
|
// src/skills/codex.md
|
|
5945
|
-
// codex_default: the Codex flavour of the Vent skill instructions, held as one
// single-quoted string literal. The value is Markdown; inside the literal,
// newlines are written as \n, apostrophes as \', and em dashes / arrows as
// \u2014 / \u2192 escapes.
// Top-level sections of the Markdown, in order: When to Test, Autonomous
// Iteration, Codex Execution, Workflow, Commands, Suite Config, Connections and
// Credentials, WebSocket, LiveKit, Output, Reporting Results.
// NOTE(review): the preceding `// src/skills/codex.md` marker suggests this
// literal is generated from that Markdown file by the bundler, and it is
// presumably what `init` installs for Codex (AGENTS.md) - neither the build step
// nor the consumer is visible here; confirm before hand-editing this text.
var codex_default = '# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. 
If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Codex Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. 
Use the JSON returned by `npx vent-hq run` directly; do not call `vent status` unless checking an older run.\n\nCodex can run shell tool calls concurrently \u2014 for multiple calls from one suite, run each named call as its own parallel shell tool call (do not combine them with `&`):\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. 
Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run or status; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq status <run-id> # Fetch results for a previous run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. 
`calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n- `prosody: true` enables emotion analysis and requires Hume access.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. 
For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. 
Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. 
Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit does not support multiple concurrent Vent calls against one agent process yet. 
Run LiveKit calls sequentially unless you intentionally start separate agent worker processes and route each call to its own process. For Node agents, that means separate Node.js processes. Do not treat parallel calls against a single LiveKit worker as a valid concurrency test until multi-call support is engineered.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. 
`"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. 
If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. 
The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5870
|
+
// Embedded skill document for Codex: a Markdown file titled "# Vent - Voice Agent Calls",
// stored verbatim as one single-quoted JS string (newlines as \n, apostrophes as \',
// dashes/arrows as \uXXXX escapes). Everything inside the quotes is runtime text that is
// handed to the coding agent, so any wording change here changes agent behavior.
// Sections, in order: When to Test, Autonomous Iteration, Codex Execution, Workflow,
// Commands, Suite Config, Connections and Credentials, WebSocket, LiveKit, Output,
// Reporting Results.
// Unlike windsurf_default below, this document has no leading `---` front-matter block.
// NOTE(review): presumably generated from src/skills/codex.md (by analogy with the
// `// src/skills/windsurf.md` marker below) and written to AGENTS.md by `vent-hq init` —
// confirm, and edit the source markdown rather than this bundled copy.
var codex_default = '# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. 
If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Codex Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. 
Use the JSON returned by `npx vent-hq run` directly.\n\nCodex can run shell tool calls concurrently \u2014 for multiple calls from one suite, run each named call as its own parallel shell tool call (do not combine them with `&`):\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. 
Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. 
`calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. 
For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. 
Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. 
Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. 
LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. 
Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. 
Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. 
After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5871
|
+
|
|
5872
|
+
// src/skills/windsurf.md
|
|
5873
|
+
var windsurf_default = '---\nname: vent\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. 
domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Windsurf Execution\n\nVent calls typically take 30 seconds to 2 minutes. 
Each `vent-hq run` is one shell tool call \u2014 wait for stdout (the JSON result) before responding. Don\'t background; use the JSON returned by `npx vent-hq run` directly. If Cascade\'s auto-execution level prompts for approval on `npx vent-hq` commands, ask the user to add `npx vent-hq *` to the workspace allow list once so subsequent calls flow without interruption.\n\nCascade runs shell tool calls in parallel within a turn \u2014 for multiple calls from one suite, issue each named call as its own separate shell tool call in the same turn (do not combine them with `&` and `wait`):\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path\n```\n\nStay within Cascade\'s per-turn parallel-tool-call budget \u2014 fan out at most ~6 calls in one turn and respect the provider concurrency caps below.\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. 
Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... 
--verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. 
Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. 
Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. 
Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. 
Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. 
Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. 
`transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. 
"the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
|
|
5946
5874
|
|
|
5947
5875
|
// src/lib/setup.ts
|
|
5948
5876
|
var SUITE_SCAFFOLD = JSON.stringify(
|
|
@@ -5974,6 +5902,12 @@ async function installCursor(cwd) {
|
|
|
5974
5902
|
await fs5.writeFile(path3.join(dir, "vent.mdc"), cursor_default);
|
|
5975
5903
|
printSuccess("Cursor: .cursor/rules/vent.mdc", { force: true });
|
|
5976
5904
|
}
|
|
5905
|
+
/**
 * Install the Vent skill file for Windsurf inside the given project directory.
 *
 * Writes the bundled `windsurf_default` skill text to
 * `<cwd>/.windsurf/skills/vent/SKILL.md` and then reports the installed path
 * through `printSuccess`.
 *
 * @param {string} cwd - Directory under which `.windsurf/skills/vent` is created.
 * @returns {Promise<void>} Settles once the file is written and the success line is printed.
 */
async function installWindsurf(cwd) {
|
|
5906
|
+
const dir = path3.join(cwd, ".windsurf", "skills", "vent");
|
|
5907
|
+
// NOTE(review): fs5 is presumably node:fs/promises; if so, `recursive: true`
// creates missing parents and does not reject when the directory exists — confirm.
await fs5.mkdir(dir, { recursive: true });
|
|
5908
|
+
await fs5.writeFile(path3.join(dir, "SKILL.md"), windsurf_default);
|
|
5909
|
+
// The message repeats the path relative to cwd that was just written.
printSuccess("Windsurf: .windsurf/skills/vent/SKILL.md", { force: true });
|
|
5910
|
+
}
|
|
5977
5911
|
var VENT_MARKERS = [
|
|
5978
5912
|
"# Vent - Voice Agent Calls",
|
|
5979
5913
|
"# Vent \u2014 Voice Agent Calls"
|
|
@@ -5999,6 +5933,7 @@ async function installSkillsAndScaffold(cwd) {
|
|
|
5999
5933
|
await installClaudeCode(cwd);
|
|
6000
5934
|
await installCursor(cwd);
|
|
6001
5935
|
await installCodex(cwd);
|
|
5936
|
+
await installWindsurf(cwd);
|
|
6002
5937
|
const suitePath = path3.join(cwd, ".vent", "suite.json");
|
|
6003
5938
|
let suiteExists = false;
|
|
6004
5939
|
try {
|
|
@@ -6119,7 +6054,6 @@ Commands:
|
|
|
6119
6054
|
agent Manage a shared local agent session
|
|
6120
6055
|
run Run a call from a suite file
|
|
6121
6056
|
stop Cancel a queued or running call
|
|
6122
|
-
status Check status of a previous run
|
|
6123
6057
|
login Authenticate via browser
|
|
6124
6058
|
logout Remove saved credentials
|
|
6125
6059
|
Options:
|
|
@@ -6146,7 +6080,6 @@ Start options:
|
|
|
6146
6080
|
|
|
6147
6081
|
Stop options:
|
|
6148
6082
|
vent-hq agent stop <session-id>`;
|
|
6149
|
-
var STATUS_USAGE = `Usage: vent-hq status <run-id> [--verbose]`;
|
|
6150
6083
|
async function main() {
|
|
6151
6084
|
loadDotenv();
|
|
6152
6085
|
const args = process.argv.slice(2);
|
|
@@ -6156,7 +6089,7 @@ async function main() {
|
|
|
6156
6089
|
return 0;
|
|
6157
6090
|
}
|
|
6158
6091
|
if (command === "--version" || command === "-v") {
|
|
6159
|
-
const pkg = await import("./package-V2MEPDXW.mjs");
|
|
6092
|
+
const pkg = await import("./package-GODDS4TH.mjs");
|
|
6160
6093
|
console.log(`vent-hq ${pkg.default.version}`);
|
|
6161
6094
|
return 0;
|
|
6162
6095
|
}
|
|
@@ -6224,26 +6157,6 @@ async function main() {
|
|
|
6224
6157
|
console.log(AGENT_USAGE);
|
|
6225
6158
|
return 2;
|
|
6226
6159
|
}
|
|
6227
|
-
case "status": {
|
|
6228
|
-
if (commandArgs.includes("--help") || commandArgs.length === 0) {
|
|
6229
|
-
console.log(STATUS_USAGE);
|
|
6230
|
-
return 0;
|
|
6231
|
-
}
|
|
6232
|
-
const { values, positionals } = parseArgs({
|
|
6233
|
-
args: commandArgs,
|
|
6234
|
-
options: {
|
|
6235
|
-
verbose: { type: "boolean", short: "v", default: false }
|
|
6236
|
-
},
|
|
6237
|
-
allowPositionals: true,
|
|
6238
|
-
strict: true
|
|
6239
|
-
});
|
|
6240
|
-
const runId = positionals[0];
|
|
6241
|
-
if (!runId) {
|
|
6242
|
-
console.log(STATUS_USAGE);
|
|
6243
|
-
return 2;
|
|
6244
|
-
}
|
|
6245
|
-
return statusCommand({ runId, verbose: values.verbose });
|
|
6246
|
-
}
|
|
6247
6160
|
case "stop": {
|
|
6248
6161
|
const runId = commandArgs[0];
|
|
6249
6162
|
if (!runId || commandArgs.includes("--help")) {
|