vent-hq 0.13.0 → 0.13.1

package/README.md CHANGED
@@ -1,6 +1,6 @@
  # vent-hq

- **Agent CLI for voice AI development.** Lets coding agents (Claude Code, Cursor, Codex) place real calls against your voice agent and read back transcripts, latency, audio, tool calls, and 60+ computed metrics — so they can iterate on prompts, flows, and platform config based on what actually happened.
+ **Agent CLI for voice AI development.** Lets coding agents (Claude Code, Cursor, Codex, Windsurf) place real calls against your voice agent and read back transcripts, latency, audio, tool calls, and 60+ computed metrics — so they can iterate on prompts, flows, and platform config based on what actually happened.

  Works with **Vapi, Retell, LiveKit, ElevenLabs, Bland, and custom WebSocket endpoints**.

@@ -24,7 +24,7 @@ npx vent-hq@latest init

  `init` will:
  - Authenticate via GitHub (if you have `gh` installed) or open a browser for device-code auth
- - Install skill files for Claude Code (`.claude/skills/vent/SKILL.md`), Cursor (`.cursor/rules/vent.mdc`), and Codex (`AGENTS.md`)
+ - Install skill files for Claude Code (`.claude/skills/vent/SKILL.md`), Cursor (`.cursor/rules/vent.mdc`), Codex (`AGENTS.md`), and Windsurf (`.windsurf/skills/vent/SKILL.md`)
  - Scaffold a starter suite at `.vent/suite.json`

  After `init`, your coding agent reads the skill file and takes over from there.
@@ -61,7 +61,6 @@ Swap `adapter` for `retell`, `livekit`, `elevenlabs`, `bland`, or `websocket` to
  | `vent-hq init` | One-time setup: auth, skill files, starter suite |
  | `vent-hq run -f <suite.json>` | Run a call (or all calls) from a suite, stream results |
  | `vent-hq run -f <s> --call <name>` | Run a single named call |
- | `vent-hq status <run-id>` | Check or stream the status of a previous run |
  | `vent-hq stop <run-id>` | Cancel a queued or running call |
  | `vent-hq agent start -f <s>` | Keep a relay session open for a local WebSocket agent |
  | `vent-hq login` / `logout` | Manage credentials |
package/dist/index.mjs CHANGED
@@ -4157,7 +4157,6 @@ var ConversationCallSpecSchema = external_exports.object({
  max_turns: external_exports.number().int().min(1).max(50).default(6),
  silence_threshold_ms: external_exports.number().int().min(200).max(1e4).optional(),
  persona: CallerPersonaSchema,
- prosody: external_exports.boolean().optional(),
  caller_audio: CallerAudioEffectsSchema.optional(),
  /** ISO 639-1 language code for multilingual calls (e.g., "es", "fr", "de"). Caller speaks this language, STT transcribes it, judge evaluates in it. */
  language: external_exports.string().min(2).max(5).optional(),
@@ -4224,7 +4223,7 @@ var LiveKitPlatformSchema = BasePlatformSchema.extend({
  livekit_api_secret: external_exports.string().optional(),
  livekit_url: external_exports.string().optional(),
  livekit_agent_name: external_exports.string().optional()
- });
+ }).strict();
  var VapiPlatformSchema = BasePlatformSchema.extend({
  provider: external_exports.literal("vapi"),
  vapi_api_key: external_exports.string().optional(),
@@ -4298,25 +4297,6 @@ var LatencyMetricsSchema = external_exports.object({
  mouth_to_ear_est_ms: external_exports.number().optional(),
  drift_slope_ms_per_turn: external_exports.number().optional()
  });
- var TurnEmotionProfileSchema = external_exports.object({
- turn_index: external_exports.number().int().min(0),
- emotions: external_exports.record(external_exports.string(), external_exports.number()),
- calmness: external_exports.number(),
- confidence: external_exports.number(),
- frustration: external_exports.number(),
- warmth: external_exports.number(),
- uncertainty: external_exports.number()
- });
- var ProsodyMetricsSchema = external_exports.object({
- per_turn: external_exports.array(TurnEmotionProfileSchema),
- mean_calmness: external_exports.number(),
- mean_confidence: external_exports.number(),
- peak_frustration: external_exports.number(),
- emotion_consistency: external_exports.number(),
- naturalness: external_exports.number(),
- emotion_trajectory: external_exports.enum(["stable", "improving", "degrading", "volatile"]),
- hume_latency_ms: external_exports.number()
- });
  var HarnessOverheadSchema = external_exports.object({
  tts_per_turn_ms: external_exports.array(external_exports.number()),
  stt_per_turn_ms: external_exports.array(external_exports.number()),
@@ -4398,7 +4378,6 @@ var ConversationMetricsSchema = external_exports.object({
  latency: LatencyMetricsSchema.optional(),
  tool_calls: ToolCallMetricsSchema.optional(),
  signal_quality: SignalQualityMetricsSchema.optional(),
- prosody: ProsodyMetricsSchema.optional(),
  harness_overhead: HarnessOverheadSchema.optional(),
  component_latency: ComponentLatencyMetricsSchema.optional()
  });
@@ -4457,7 +4436,6 @@ function formatConversationResult(raw, options = {}) {
  const warnings = dedupeStrings([
  ...formatProviderWarningMessages(r.call_metadata?.provider_warnings)
  ]);
- const emotion = r.metrics?.prosody ? formatEmotion(r.metrics.prosody) : null;
  const result = {
  name: r.name ?? null,
  status: r.status,
@@ -4470,7 +4448,6 @@ function formatConversationResult(raw, options = {}) {
  call_metadata: formatCallMetadata(r.call_metadata, verbose)
  };
  if (warnings.length > 0) result.warnings = warnings;
- if (emotion) result.emotion = emotion;
  if (verbose) result.caller_prompt = r.caller_prompt;
  if (debug2) result.debug = debug2;
  return result;
@@ -4577,15 +4554,6 @@ function stripExecutionMessage(args) {
  const { execution_message: _drop, ...rest } = args;
  return rest;
  }
- function formatEmotion(prosody) {
- return {
- naturalness: prosody.naturalness,
- mean_calmness: prosody.mean_calmness,
- mean_confidence: prosody.mean_confidence,
- peak_frustration: prosody.peak_frustration,
- emotion_trajectory: prosody.emotion_trajectory
- };
- }
  function formatComponentLatency(cl, verbose) {
  if (!cl) return null;
  const result = {
@@ -4648,7 +4616,6 @@ function formatDebug(result) {
  const debug2 = compactUnknownRecord({
  signal_quality: result.metrics?.signal_quality,
  harness_overhead: result.metrics?.harness_overhead,
- prosody: result.metrics?.prosody,
  provider_warnings: nonEmptyArray(result.call_metadata?.provider_warnings),
  component_latency_per_turn: nonEmptyArray(result.metrics?.component_latency?.per_turn),
  observed_tool_calls: formatDebugToolCalls(result.observed_tool_calls),
@@ -4710,9 +4677,6 @@ function stdoutSync(data) {
  }
  }
  }
- function writeJsonStdout(value) {
- stdoutSync(JSON.stringify(value, null, 2) + "\n");
- }
  var bold = (s) => isTTY ? `\x1B[1m${s}\x1B[0m` : s;
  var dim = (s) => isTTY ? `\x1B[2m${s}\x1B[0m` : s;
  var green = (s) => isTTY ? `\x1B[32m${s}\x1B[0m` : s;
@@ -4819,7 +4783,7 @@ function printSummary(callResults, runComplete, runId, options = {}) {
  stdoutSync(" " + parts.join(" ") + "\n");
  }
  }
- stdoutSync(dim(`Full details: vent status ${runId}${options.verbose ? " --verbose" : ""}`) + "\n");
+ stdoutSync(dim(`Run ID: ${runId}`) + "\n");
  }
  function buildRunSummaryJson(options) {
  const calls = options.rawCalls ? formatRawCalls(options.rawCalls, options.verbose ?? false) : options.formattedCalls ?? [];
@@ -5788,45 +5752,6 @@ function findFreePort() {
  });
  }

- // src/commands/status.ts
- async function statusCommand(args) {
- const accessToken = await loadAccessToken();
- if (!accessToken) {
- printError("No Vent access token found. Run `npx vent-hq init` first.");
- return 2;
- }
- try {
- const res = await apiFetch(`/runs/${args.runId}`, accessToken);
- const data = await res.json();
- const aggregate = data.aggregate;
- const counts = aggregate?.conversation_calls;
- const results = Array.isArray(data.results) ? data.results : [];
- const summary = buildRunSummaryJson({
- runId: typeof data.id === "string" ? data.id : args.runId,
- status: data.status,
- total: counts?.total,
- passed: counts?.passed,
- failed: counts?.failed,
- rawCalls: results,
- verbose: args.verbose,
- runDetails: {
- created_at: data.created_at,
- started_at: data.started_at,
- finished_at: data.finished_at,
- duration_ms: data.duration_ms,
- error_text: data.error_text,
- aggregate: data.aggregate
- }
- });
- writeJsonStdout(summary);
- const status = data.status;
- return status === "pass" ? 0 : status === "fail" ? 1 : 0;
- } catch (err) {
- printError(err.message);
- return 2;
- }
- }
-
  // src/lib/browser.ts
  import { exec } from "node:child_process";
  function openBrowser(url) {
@@ -5936,13 +5861,16 @@ import * as fs5 from "node:fs/promises";
  import * as path3 from "node:path";

  // src/skills/claude-code.md
- var claude_code_default = '---\nname: vent\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nallowed-tools: Bash(npx vent-hq *)\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. 
The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Claude Code Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly; do not call `vent status` unless checking an older run.\n\nClaude Code serializes separate Bash tool calls for `npx vent-hq ...`, so run multiple calls from one suite by invoking each named call with `--call <name>` in one Bash command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\nFor long-running composite commands \u2014 e.g. LiveKit\'s required `kill workers \u2192 start fresh worker \u2192 wait 60s \u2192 npx vent-hq run` preflight \u2014 keep the entire sequence in **one foreground Bash call**. Do not use `Monitor` or `run_in_background: true` for the wait. Both end the turn until an event fires, which means Vent never sees an inflight call and the user reads it as "you backgrounded the test." The harness also blocks bare `sleep 60` (any long leading sleep), so fold the readiness check and the wait into a single `until`-loop \u2014 short sleeps inside a polling loop are allowed because each iteration is real work:\n\n```bash\nnohup npm run dev > /tmp/lk-agent.log 2>&1 &\nstart=$(date +%s); \\\nuntil grep -q "registered worker" /tmp/lk-agent.log 2>/dev/null \\\n && [ $(($(date +%s) - start)) -ge 60 ]; do sleep 2; done \\\n&& npx vent-hq run -f .vent/suite.livekit.json --call <name>\n```\n\nFor these composites, raise the Bash timeout to **10 minutes** (`600000` ms) \u2014 startup + 60s wait + call can easily exceed the 5-minute default.\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. 
Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally. (See `Claude Code Execution` for how to compose the kill\u2192start\u2192wait\u2192submit as a single foreground command.)\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run or status; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq status <run-id> # Fetch results for a previous run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. 
Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n- `prosody: true` enables emotion analysis and requires Hume access.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. 
Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit does not support multiple concurrent Vent calls against one agent process yet. Run LiveKit calls sequentially unless you intentionally start separate agent worker processes and route each call to its own process. For Node agents, that means separate Node.js processes. Do not treat parallel calls against a single LiveKit worker as a valid concurrency test until multi-call support is engineered.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. 
Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. 
Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
+ var claude_code_default = '---\nname: vent\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nallowed-tools: Bash(npx vent-hq *)\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. 
The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Claude Code Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly.\n\nClaude Code serializes separate Bash tool calls for `npx vent-hq ...`, so run multiple calls from one suite by invoking each named call with `--call <name>` in one Bash command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\nFor long-running composite commands \u2014 e.g. LiveKit\'s required `kill workers \u2192 start fresh worker \u2192 wait 60s \u2192 npx vent-hq run` preflight \u2014 keep the entire sequence in **one foreground Bash call**. Do not use `Monitor` or `run_in_background: true` for the wait. Both end the turn until an event fires, which means Vent never sees an inflight call and the user reads it as "you backgrounded the test." The harness also blocks bare `sleep 60` (any long leading sleep), so fold the readiness check and the wait into a single `until`-loop \u2014 short sleeps inside a polling loop are allowed because each iteration is real work:\n\n```bash\nnohup npm run dev > /tmp/lk-agent.log 2>&1 &\nstart=$(date +%s); \\\nuntil grep -q "registered worker" /tmp/lk-agent.log 2>/dev/null \\\n && [ $(($(date +%s) - start)) -ge 60 ]; do sleep 2; done \\\n&& npx vent-hq run -f .vent/suite.livekit.json --call <name>\n```\n\nFor these composites, raise the Bash timeout to **10 minutes** (`600000` ms) \u2014 startup + 60s wait + call can easily exceed the 5-minute default.\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. 
Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally. (See `Claude Code Execution` for how to compose the kill\u2192start\u2192wait\u2192submit as a single foreground command.)\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. 
Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. 
Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). 
If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. 
Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
 
  // src/skills/cursor.md
- var cursor_default = '---\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nalwaysApply: true\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. 
The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Cursor Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly; do not call `vent status` unless checking an older run.\n\nCursor cannot run separate shell tool calls concurrently \u2014 for multiple calls from one suite, invoke each named call with `--call <name>` in one shell command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... 
--session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run or status; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq status <run-id> # Fetch results for a previous run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n- `prosody: true` enables emotion analysis and requires Hume access.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. 
Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. 
Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit does not support multiple concurrent Vent calls against one agent process yet. Run LiveKit calls sequentially unless you intentionally start separate agent worker processes and route each call to its own process. For Node agents, that means separate Node.js processes. Do not treat parallel calls against a single LiveKit worker as a valid concurrency test until multi-call support is engineered.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. 
`transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
5867
+ var cursor_default = '---\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\nalwaysApply: true\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. 
The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Cursor Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly.\n\nCursor cannot run separate shell tool calls concurrently \u2014 for multiple calls from one suite, invoke each named call with `--call <name>` in one shell command using `&` and a final `wait`:\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path & \\\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path & \\\nwait\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... 
--session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. 
Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once with `&` and `wait`), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. 
Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. 
Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. 
The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
5943
5868
 
5944
5869
  // src/skills/codex.md
5945
- var codex_default = '# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. 
"the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Codex Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly; do not call `vent status` unless checking an older run.\n\nCodex can run shell tool calls concurrently \u2014 for multiple calls from one suite, run each named call as its own parallel shell tool call (do not combine them with `&`):\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... 
--verbose # Add to any run or status; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq status <run-id> # Fetch results for a previous run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n- `prosody: true` enables emotion analysis and requires Hume access.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. 
Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. 
Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit does not support multiple concurrent Vent calls against one agent process yet. Run LiveKit calls sequentially unless you intentionally start separate agent worker processes and route each call to its own process. For Node agents, that means separate Node.js processes. Do not treat parallel calls against a single LiveKit worker as a valid concurrency test until multi-call support is engineered.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. 
Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
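The saved-history comparison the skill describes (null-check `latency` and `call_metadata` before reading nested fields, flag p95 latency up more than 20% and cost up more than 30%, note pipeline status flips) can be done with a short Node script. A sketch, assuming two run files under `.vent/runs/` whose names are hypothetical and the documented result shape:

```js
// Compare the latest saved Vent run against a prior one using the thresholds
// described above. File names are illustrative; the JSON shape follows the
// documented .vent/runs/ output.
import { readFileSync } from 'node:fs';

const prev = JSON.parse(readFileSync('.vent/runs/prev-run.json', 'utf8'));
const curr = JSON.parse(readFileSync('.vent/runs/latest-run.json', 'utf8'));

for (const call of curr.call_results) {
  const before = prev.call_results.find((c) => c.name === call.name);
  if (!before) continue;

  // latency may be null when the underlying analysis didn't run.
  if (call.latency && before.latency) {
    const ratio = call.latency.p95_response_time_ms / before.latency.p95_response_time_ms;
    if (ratio > 1.2) {
      console.log(`${call.name}: p95 latency up ${Math.round((ratio - 1) * 100)}%`);
    }
  }

  // call_metadata may also be null; cost comparison only when both runs have it.
  if (call.call_metadata && before.call_metadata) {
    const costRatio = call.call_metadata.cost_usd / before.call_metadata.cost_usd;
    if (costRatio > 1.3) {
      console.log(`${call.name}: cost up ${Math.round((costRatio - 1) * 100)}%`);
    }
  }

  // Pipeline-only status; mission success is still judged from the transcript.
  if (call.status !== before.status) {
    console.log(`${call.name}: status flipped ${before.status} -> ${call.status}`);
  }
}
```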
5870
+ var codex_default = '# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. 
"the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Codex Execution\n\nUse a 5-minute shell-tool timeout (`300000` ms) on Vent run commands so normal calls are not killed by the default 2-minute Bash timeout. This is not backgrounding; wait for stdout/results before ending your response. Use the JSON returned by `npx vent-hq run` directly.\n\nCodex can run shell tool calls concurrently \u2014 for multiple calls from one suite, run each named call as its own parallel shell tool call (do not combine them with `&`):\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path\n```\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... 
--verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. 
Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. 
Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. 
Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. 
The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
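The WebSocket section embedded above lists the optional JSON text frames only as a table of examples. A minimal Node sketch of an agent emitting them, not taken from the package: it assumes the tested agent is the websocket server listening on the suite's `agent_port` and uses the `ws` package, and it elides the actual audio/STT/LLM/TTS pipeline.

```js
// Sketch: a custom websocket agent that accepts Vent's PCM audio and emits the
// optional event frames. Port 3001 mirrors agent_port in the suite example; the
// tool name and timing numbers are the illustrative values from the frame table.
import { WebSocketServer } from 'ws';

const wss = new WebSocketServer({ port: 3001 });

wss.on('connection', (socket) => {
  const send = (frame) => socket.send(JSON.stringify(frame));

  socket.on('message', (data, isBinary) => {
    if (isBinary) {
      // binary 16-bit mono PCM from Vent's caller; hand off to your STT here (elided)
      return;
    }
    const event = JSON.parse(data.toString());
    if (event.type === 'end-call') socket.close(); // Vent signals the test is over
  });

  // in a real agent these fire from the tool handler and the response pipeline:
  send({ type: 'tool_call', name: 'check_availability', arguments: {}, result: {}, successful: true, duration_ms: 150 });
  send({ type: 'vent:timing', stt_ms: 120, llm_ms: 450, tts_ms: 80 });
});
```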
5871
+
5872
+ // src/skills/windsurf.md
5873
+ var windsurf_default = '---\nname: vent\ndescription: Place real calls against the user\'s voice agent to verify recent code changes worked end-to-end. Trigger on test requests or after any non-trivial voice-agent change.\n---\n\n# Vent - Voice Agent Calls\n\n## When to Test\n\nVent is the voice CI gate. Auto-trigger a call after any non-trivial voice-affecting change \u2014 don\'t wait to be asked. Non-trivial means anything that affects what the caller hears, how the agent responds, what tools are called, how the call routes, or audio timing: prompt rewrites or sections (not one-word tweaks), tool schema or handler changes, state/memory logic, transfer logic, provider/audio config, platform integration, and any voice-behavior bug fix. One-line copy or comment edits don\'t need a call.\n\nBefore launching, scan `.vent/runs/` for unresolved issues from prior runs (failing status, regressions, warnings, latency jumps); fix in-scope ones as part of this change and verify they\'re resolved by the new run, surface out-of-scope ones rather than silently leaving them. The task is not done until you\'ve reviewed the transcript, tool calls, and observed behavior and judged that the change works \u2014 Vent\'s `status` and exit code only reflect pipeline completion, not mission success.\n\n## Autonomous Iteration\n\n**Default: iterate.** Run a fix \u2192 Vent-call \u2192 fix \u2192 Vent-call loop whenever a call surfaces a fixable issue. Don\'t report a failure (or a degraded-quality success) back to the user before attempting to fix it \u2014 they want the agent working, not a diagnosis. After each call, if the result reveals a problem you can fix with a concrete plan, apply the fix and run again. Fixable issues include: tool schema or registry gaps, missing handlers, prompt phrasing, **STT vocabulary mismatches** (e.g. domain terms misheard \u2014 Deepgram `keyterms`, custom vocabulary, prompt-level term lists), **latency tunables** (endpointing thresholds, preemptive generation, model selection), and audio quality config. If you\'d be guessing at the next attempt, stop and ask the user instead of trying.\n\n**"Mission success but with quality problems" still requires iteration.** If the call ran end-to-end but had noticeably bad latency, STT mishearing the agent\'s domain vocabulary, audio artifacts, or any other caller-experience degradation that has a concrete fix, the call is *not done* \u2014 apply the fix and re-run. Don\'t downgrade fixable problems to "observations" or "things worth flagging." The pipeline running cleanly is the floor, not the ceiling.\n\n**Never ask permission to iterate.** "Want me to apply this and re-run?" / "Should I fix it?" / "Let me know if you want me to..." \u2014 these are skill violations. If you have a concrete fix, apply it and run the next call. The user opts out by saying so explicitly; silence is not opt-out.\n\nStop iterating and report when one of:\n\n- The next call confirms the fix worked AND no further fixable issues remain (success \u2014 done).\n- The same fix fails twice (your hypothesis is wrong; ask the user).\n- The failure mode shifts between attempts (you\'re chasing a moving target).\n- You can\'t justify the next call against its provider cost \u2014 each call spends real money and provider quota.\n\nIf the user explicitly opts out ("just report", "don\'t iterate", "stop after the first call", "I\'ll fix it myself"), respect that \u2014 run once, report, stop. Otherwise iterate by default. 
The first time you start an autonomous loop in a session, mention it once so the user knows it\'s happening (e.g. "the call surfaced X \u2014 fixing and re-running; I\'ll report back when it converges or stops making progress").\n\n## Windsurf Execution\n\nVent calls typically take 30 seconds to 2 minutes. Each `vent-hq run` is one shell tool call \u2014 wait for stdout (the JSON result) before responding. Don\'t background; use the JSON returned by `npx vent-hq run` directly. If Cascade\'s auto-execution level prompts for approval on `npx vent-hq` commands, ask the user to add `npx vent-hq *` to the workspace allow list once so subsequent calls flow without interruption.\n\nCascade runs shell tool calls in parallel within a turn \u2014 for multiple calls from one suite, issue each named call as its own separate shell tool call in the same turn (do not combine them with `&` and `wait`):\n\n```bash\nnpx vent-hq run -f .vent/suite.vapi.json --call happy-path\nnpx vent-hq run -f .vent/suite.vapi.json --call tool-path\n```\n\nStay within Cascade\'s per-turn parallel-tool-call budget \u2014 fan out at most ~6 calls in one turn and respect the provider concurrency caps below.\n\n## Workflow\n\n1. Identify the behavior under test. Read enough of the agent codebase to understand its system prompt, tools, handlers, routes, provider config, platform wiring, and expected handoffs.\n2. Reuse an existing `.vent/suite.<adapter>.json` when possible. If `.vent/` contains multiple suites, inspect `connection.adapter` and report which suite file produced the result.\n3. Create or update a suite only when the existing calls do not cover the changed behavior. Name calls after real flows, for example `reschedule-appointment`, not `call-1`.\n4. If the suite uses `start_command`, start one shared local session first with `npx vent-hq agent start -f .vent/suite.<adapter>.json`, then pass `--session <session-id>` to each run.\n\n **For locally-run LiveKit agents: every run requires killing *all* workers, starting one fresh worker, and waiting a full 60 seconds before submitting.** Unconditional \u2014 LiveKit Cloud round-robins across registered workers, so a single survivor with a dead inference subprocess fails ~N-1 of N calls. Don\'t rely on `pkill -f <path-pattern>`; bare command lines like `node --import tsx agent.ts dev` won\'t match a path filter. Use `ps aux | grep -E "node.*agent\\.ts|@livekit/agents.*ipc"`, `kill -9` by PID, re-run `ps` to confirm zero survivors, then start the fresh worker. Skipping the 60s wait fails with `did not publish audio track`; if that error appears alongside `Error [ERR_IPC_CHANNEL_CLOSED] from InferenceProcExecutor.doInference` in the agent log right after a "running EOU detection" line, that\'s a straggler \u2014 redo the kill sweep. Hosted LiveKit Cloud agents don\'t need any of this; run normally.\n5. Pick which call(s) to run based on the change. Fixed bug: replay the failing scenario. Changed tool: include a call that triggers that tool. Prompt or routing change: include the relevant happy path and any important edge path.\n6. Compare against the previous JSON in `.vent/runs/` when validating a fix or regression. Check status flips, latency jumps, tool-call success drops, cost jumps, and transcript divergence. 
Correlate with `git diff` between saved `git_sha` values when available; skip if no previous run exists.\n\n## Commands\n\n```bash\nnpx vent-hq init # First-time setup (auth + skill install + starter suite)\nnpx vent-hq login # Log in to existing account\nnpx vent-hq login --status # Print whether credentials are present\nnpx vent-hq logout # Remove saved credentials from ~/.vent/credentials\nnpx vent-hq run -f .vent/suite.X.json # Run a single-call suite\nnpx vent-hq run -f .vent/suite.X.json --call NAME # Run one named call from a multi-call suite\nnpx vent-hq run ... --session <session-id> # Add to any run; routes through an existing local relay session\nnpx vent-hq run ... --verbose # Add to any run; include verbose debug fields\nnpx vent-hq stop <run-id> # Cancel a queued or running run\nnpx vent-hq agent start -f .vent/suite.X.json # Start a shared local relay session\nnpx vent-hq agent stop <session-id> # Stop a shared local relay session\n```\n\nIf `~/.vent/credentials` is missing and `VENT_ACCESS_TOKEN` is not set, run `npx vent-hq init`. For an existing account, run `npx vent-hq login` or set `VENT_ACCESS_TOKEN`.\n\n## Suite Config\n\nSuites live in `.vent/suite.<adapter>.json`. `connection` is declared once per suite. `calls` is a named map, and each key becomes the call name used with `--call`.\n\nLocal websocket suite:\n\n```json\n{\n "connection": {\n "adapter": "websocket",\n "start_command": "npm run start",\n "health_endpoint": "/health",\n "agent_port": 3001\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8,\n "silence_threshold_ms": 1200\n }\n }\n}\n```\n\nPlatform-direct suite:\n\n```json\n{\n "connection": {\n "adapter": "vapi",\n "platform": { "provider": "vapi" }\n },\n "calls": {\n "happy-path": {\n "caller_prompt": "You are Maria calling to reschedule her appointment to next Tuesday.",\n "max_turns": 8\n }\n }\n}\n```\n\nWrite `caller_prompt` as a realistic caller with a name, goal, mood, constraints, and conditional behavior. Set `max_turns` based on flow complexity: FAQ `4-6`, booking or tool use `8-12`, complex flows `12-20`.\n\nCall fields:\n\n- `caller_prompt` and `max_turns` are required.\n- `silence_threshold_ms` must be `200-10000`. Common ranges: FAQ `800-1200`, tool calls `2000-3000`, complex reasoning `3000-5000`.\n- `persona` supports `pace`, `clarity`, `disfluencies`, `cooperation`, `emotion`, `memory`, `intent_clarity`, and `confirmation_style`.\n- `caller_audio` supports noise, speed, speakerphone, mic distance, clarity, accent, packet loss, and jitter.\n- `language` is an ISO 639-1 code such as `en`, `es`, `fr`, `de`, `it`, `nl`, or `ja`.\n- `voice` is `"male"` or `"female"` (English only; default female). Use to flip the caller\'s perceived gender. Ignored if `caller_audio.accent` is set or `language` is non-English.\n\n## Connections and Credentials\n\n### Adapter choice\n\nUse `websocket` for your own local or hosted runtime. Use `start_command` for local agents or `agent_url` for hosted custom endpoints. For `start_command` and `agent_url`, do not put Deepgram, ElevenLabs, OpenAI, or other agent runtime keys into Vent config unless the Vent adapter itself needs them \u2014 the tested agent owns its own runtime credentials.\n\nUse `vapi`, `retell`, `elevenlabs`, `bland`, or `livekit` for platform-direct testing. 
In this mode Vent itself talks to the provider on the user\'s behalf.\n\nVent provides `DEEPGRAM_API_KEY` and `ANTHROPIC_API_KEY` for its hosted caller/evaluation stack \u2014 those are Vent\'s, not the tested agent\'s.\n\n### Credential resolution\n\nIn platform-direct mode the CLI auto-resolves credentials from `.env.local`, `.env`, and the current shell environment. Do not run `source .env && export` before Vent commands. If you include credential fields in JSON, use the actual value, not the env var name. Do not manually author `platform_connection_id`; the CLI creates or updates the saved platform connection automatically.\n\nAuto-resolved env vars and JSON fields:\n\n- Vapi: `VAPI_API_KEY` -> `vapi_api_key`; `VAPI_ASSISTANT_ID` or `VAPI_AGENT_ID` -> `vapi_assistant_id`\n- Bland: `BLAND_API_KEY` -> `bland_api_key`; `BLAND_PATHWAY_ID` -> `bland_pathway_id`; `BLAND_PERSONA_ID` -> `persona_id`\n- LiveKit: `LIVEKIT_API_KEY` -> `livekit_api_key`; `LIVEKIT_API_SECRET` -> `livekit_api_secret`; `LIVEKIT_URL` -> `livekit_url`\n- Retell: `RETELL_API_KEY` -> `retell_api_key`; `RETELL_AGENT_ID` -> `retell_agent_id`\n- ElevenLabs: `ELEVENLABS_API_KEY` -> `elevenlabs_api_key`; `ELEVENLABS_AGENT_ID` -> `elevenlabs_agent_id`\n\n### Provider config\n\nUse existing provider config when possible: Vapi assistant, Retell agent, ElevenLabs agent, Bland pathway, or LiveKit agent. Bland uniquely supports inline config \u2014 `platform` may use `bland_pathway_id`, `persona_id`, or an inline `task` (with optional voice, model, and turn-handling overrides; see Bland\'s API docs for the full field list).\n\n### Concurrency\n\nWhen you fan out multiple Vent calls in parallel against the same provider (for example, running several named calls from one suite at once), respect the provider\'s per-account concurrency limit. Exceeding it makes calls queue or fail at the provider \u2014 Vent does not enforce these caps for you.\n\nRecord the limit as `max_concurrency` in the suite\'s `platform` block so it\'s visible on future runs. Ask the user which plan they\'re on if sizing matters; otherwise use the conservative default in bold.\n\n- **Vapi**: **10** included per account; reserved lines can be purchased self-serve; Enterprise is unlimited.\n- **Retell**: Pay-as-you-go includes **20**; Enterprise has no cap.\n- **Bland**: Start=**10**, Build=50, Scale=100, Enterprise=unlimited.\n- **ElevenLabs**: Free=**4**, Starter=6, Creator=10, Pro=20, Scale=30, Business=30. Burst pricing can temporarily allow up to 3x base.\n- **LiveKit Cloud**: Build=**5**, Ship=20, Scale=50 managed inference sessions (the usual gate for voice agents); agent-session concurrency can go higher (Scale up to 600).\n\n## WebSocket\n\nFor `adapter: "websocket"`, Vent sends binary 16-bit mono PCM audio over one websocket connection. Websocket text frames are optional JSON events. Audio-only websocket agents still work, but events improve turn detection and observability. 
Vent sends `{"type":"end-call"}` when the test is done.\n\nUseful websocket text frames:\n\n```jsonc\n{"type":"speech-update","status":"started"}\n{"type":"speech-update","status":"stopped"}\n{"type":"tool_call","name":"check_availability","arguments":{},"result":{},"successful":true,"duration_ms":150}\n{"type":"vent:timing","stt_ms":120,"llm_ms":450,"tts_ms":80}\n{"type":"vent:session","platform":"custom","provider_call_id":"call_123","provider_session_id":"session_abc"}\n{"type":"vent:call-metadata","call_metadata":{"recording_url":"https://...","cost_usd":0.12}}\n{"type":"vent:transcript","role":"caller","text":"I need to reschedule","turn_index":0}\n{"type":"vent:transfer","destination":"+15551234567","status":"attempted"}\n{"type":"vent:debug-url","label":"trace","url":"https://..."}\n{"type":"vent:warning","message":"provider warning","code":"provider_warning"}\n```\n\n`vent:session-report` is **not** handled by the websocket adapter \u2014 it\'s only consumed by the LiveKit helper. Do not emit it from a websocket agent.\n\nPlatform adapters capture tool calls automatically. Websocket agents must emit `tool_call` frames for tool observability. Platform adapters get component latency automatically. Websocket agents should emit `vent:timing` after each agent response when STT/LLM/TTS breakdown is available.\n\n## LiveKit\n\nBefore running LiveKit tests, install and add the Vent helper to the LiveKit agent entrypoint. Node: `npm install @vent-hq/livekit`, then call `instrumentLiveKitAgent({ ctx, session })`. Python: `pip install vent-livekit`, then call `instrument_livekit_agent(ctx=ctx, session=session)`.\n\nLiveKit direct mode requires the LiveKit Agents SDK. Custom LiveKit participants should use the websocket adapter with a relay. If the LiveKit agent registered with an explicit dispatch name, set `livekit_agent_name` in `platform`.\n\nLiveKit parallel calls are supported, but capacity depends on where the user runs their agent. LiveKit Cloud only routes audio \u2014 the agent code runs on the user\'s own infrastructure (laptop, Fly.io, Railway, k8s, etc.), and that box\'s CPU/RAM is the real bottleneck. Before firing parallel calls, derive `platform.max_concurrency` from the minimum of: (1) the user\'s LiveKit plan limit (Build=5, Ship=20, Scale=50+ concurrent agent sessions), (2) their agent worker box capacity (rough rule: 1c/1GB\u21921\u20133 jobs, 2c/4GB\u21925\u201310, 4c/8GB\u219210\u201325, 8c/16GB\u219225\u201350; multiply by number of worker boxes), and (3) their STT/TTS provider concurrency (e.g. Deepgram TTS streaming = 2). If the user wants more parallel calls than that minimum allows, push back \u2014 explain which limit is the bottleneck (plan / box / provider) and either reduce the count or suggest scaling that limit. Architecture: register one `agent_name` and run N workers under it (one big worker, or several smaller ones \u2014 same name); LiveKit\'s dispatcher round-robins jobs across them with built-in failover. Don\'t use numbered names (`agent-0`, `agent-1`) \u2014 that reinvents what the dispatcher does and breaks failover.\n\nUse the LiveKit helper for observability; do not publish `vent:*` topics manually. Do not hand-roll `vent:session-report` from `ctx.addShutdownCallback`; after `room.disconnect()` it can fail with `engine is closed`. The helper captures SDK metrics, tool events, conversation items, usage, and close events. 
Native LiveKit `lk.transcription` and `lk.agent.state` provide transcript and agent-state timing.\n\n## Output\n\n### Live result\n\n`npx vent-hq run` returns a single JSON result on stdout in non-TTY mode (not an SSE JSONL stream). Exit codes: `0` = call ran through the pipeline; `1` = pipeline-level failure; `2` = harness error.\n\nMost result fields are always present; `latency`, `component_latency`, `call_metadata`, and `emotion` may be `null` when the underlying analysis didn\'t run; `debug` is absent without `--verbose`. Branch on null before reading nested fields. Use `--verbose` only when the default doesn\'t explain a failure \u2014 when you need `platform_transcript` (to check Vent\'s STT), per-turn or component-level latency breakdowns, the raw tool-call timeline, or provider-native artifacts in `debug.provider_metadata`. Otherwise skip \u2014 it just adds noise.\n\nVent\'s transcript is ground truth. Judge on semantic intent: ignore homophones and minor mis-hears (`"check teach hat"` for `"check that"`, missing question marks on short tails) \u2014 those are streaming-STT noise on Vent\'s caller side, not agent bugs, and **don\'t surface them in the report** (they\'re Vent-side artifacts, not actionable for the user). But clear gibberish or word-soup (e.g. `"Cristoxin"` where the agent should have said `"Of course, talk soon"`) is **not** a Vent artifact \u2014 Vent\'s STT does not invent words like that. It means the platform\'s TTS produced corrupted audio or the agent\'s STT/LLM generated the wrong text, and the fix lives there (TTS voice config, agent prompt, model temperature, codec issue). Never dismiss the run as a "Vent harness STT" issue; iterate on the agent or flag the platform.\n\nFor transfers: `call_metadata.transfer_attempted` (provider claimed) and `transfer_completed` (Vent-verified) can disagree \u2014 report both. `transfers[]` carries destination, type, and per-attempt status.\n\n### Saved history\n\nAfter every run, Vent writes the full result JSON to `.vent/runs/`. Shape:\n\n```jsonc\n{\n "run_id": "...",\n "timestamp": "2026-04-21T...Z",\n "git_sha": "...",\n "summary": { "calls_total": 2, "total_duration_ms": 12345, "total_cost_usd": 0.01 },\n "call_results": [\n { "name": "happy-path", "status": "completed", "duration_ms": 6123, "transcript": [], "observed_tool_calls": [], "latency": { "response_time_ms": 420, "p95_response_time_ms": 980 }, "call_metadata": { "cost_usd": 0.004 } }\n ]\n}\n```\n\nWhen comparing against a prior run (Workflow step 6), inspect:\n\n- Run-completion status flips: `call_results[i].status` (pipeline-only \u2014 judge mission success from the transcript)\n- Latency: `call_results[i].latency.response_time_ms` (mean) or `latency.p95_response_time_ms` increased >20%\n- Tool calls: count of `call_results[i].observed_tool_calls[].successful` dropped\n- Cost: `summary.total_cost_usd` or `call_results[i].call_metadata.cost_usd` increased >30%\n- Transcript: `call_results[i].transcript` diverged in semantic content (ignore STT noise)\n\n## Reporting Results\n\nBefore reporting, read the agent\'s code to locate where the observed behavior originates. If the issue is small and you can fix it, fix it and explain what you did \u2014 don\'t ask permission first.\n\nAdapt the report shape to the call \u2014 a clean pass needs little, a regression with a multi-layer cause needs more. 
Use a transcript excerpt when it helps the user see what happened.\n\nHard rules:\n\n- Pair raw numbers with their plain-English meaning \u2014 don\'t drop the number, but don\'t leave it unexplained. E.g. "p95 latency was 850ms, which is snappy and well within natural conversational pacing" or "p95 hit 1.6 seconds with the LLM as the bottleneck \u2014 noticeably sluggish to a caller."\n- Name the user\'s voice agent by platform on first mention (e.g. "the Vapi agent responded snappily throughout") so the user knows immediately which agent the observation is about. After that, just say "the agent" \u2014 don\'t repeat the platform name on every line.\n- Always include the recording from `call_metadata.recording_url` as an inline `[Recording](url)` link, placed in **one block at the very end of the report** \u2014 never sprinkled through the prose. Single call: one link as the last line. Multi-call: one labeled link per call (e.g. `reschedule-appointment: [Recording](url)`). Never paste a bare URL.\n- Mission success is your judgment, not Vent\'s. The per-call `status` is only `"completed"` (pipeline ran) or `"error"` (pipeline failed); decide whether the agent actually accomplished the scenario from the transcript and tool calls.\n- Similar-sounding word substitutions (e.g. "ocean" for "OSHA") are STT ambiguity, not comprehension failure. The fix lives in STT keyword hints, custom vocabulary, or a prompt-level term list \u2014 not the agent\'s reasoning.\n- Surface only what the user can act on in their own agent\'s code or config \u2014 never `warnings[]` (infrastructure noise), Vent-side artifacts (caller wait modes, harness timing, internal pipeline quirks), or `cost_usd` unless asked.\n\nFor multi-call runs, lead with your own judgment of what happened across the calls (e.g. "3 of 4 did what they were supposed to; `cancel-appointment` never actually canceled"), not a parroted pass/fail count. Then cover each call with whatever depth it needs.\n';
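The LiveKit concurrency guidance above boils down to taking the minimum of three caps. A small worked sketch of that rule, not from the package: the plan numbers and the per-box table come straight from the text above (using the conservative end of each rough range), while the inputs at the bottom are made-up examples.

```js
// Sketch: derive platform.max_concurrency as min(plan cap, worker capacity, provider cap).
const PLAN_LIMITS = { build: 5, ship: 20, scale: 50 };

// conservative end of the rough rule: 1c/1GB -> 1, 2c/4GB -> 5, 4c/8GB -> 10, 8c/16GB -> 25
function jobsPerBox(cores) {
  if (cores >= 8) return 25;
  if (cores >= 4) return 10;
  if (cores >= 2) return 5;
  return 1;
}

function maxConcurrency({ plan, coresPerBox, workerBoxes, providerLimit }) {
  return Math.min(
    PLAN_LIMITS[plan],                     // LiveKit Cloud plan cap
    jobsPerBox(coresPerBox) * workerBoxes, // agent worker infrastructure cap
    providerLimit                          // STT/TTS provider cap, e.g. Deepgram TTS streaming = 2
  );
}

// Ship plan, one 4-core worker box, provider allows 2 concurrent streams -> 2 (provider is the bottleneck)
console.log(maxConcurrency({ plan: 'ship', coresPerBox: 4, workerBoxes: 1, providerLimit: 2 }));
```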
5946
5874
 
5947
5875
  // src/lib/setup.ts
5948
5876
  var SUITE_SCAFFOLD = JSON.stringify(
@@ -5974,6 +5902,12 @@ async function installCursor(cwd) {
5974
5902
  await fs5.writeFile(path3.join(dir, "vent.mdc"), cursor_default);
5975
5903
  printSuccess("Cursor: .cursor/rules/vent.mdc", { force: true });
5976
5904
  }
5905
+ async function installWindsurf(cwd) {
5906
+ const dir = path3.join(cwd, ".windsurf", "skills", "vent");
5907
+ await fs5.mkdir(dir, { recursive: true });
5908
+ await fs5.writeFile(path3.join(dir, "SKILL.md"), windsurf_default);
5909
+ printSuccess("Windsurf: .windsurf/skills/vent/SKILL.md", { force: true });
5910
+ }
5977
5911
  var VENT_MARKERS = [
5978
5912
  "# Vent - Voice Agent Calls",
5979
5913
  "# Vent \u2014 Voice Agent Calls"
@@ -5999,6 +5933,7 @@ async function installSkillsAndScaffold(cwd) {
5999
5933
  await installClaudeCode(cwd);
6000
5934
  await installCursor(cwd);
6001
5935
  await installCodex(cwd);
5936
+ await installWindsurf(cwd);
6002
5937
  const suitePath = path3.join(cwd, ".vent", "suite.json");
6003
5938
  let suiteExists = false;
6004
5939
  try {
@@ -6119,7 +6054,6 @@ Commands:
6119
6054
  agent Manage a shared local agent session
6120
6055
  run Run a call from a suite file
6121
6056
  stop Cancel a queued or running call
6122
- status Check status of a previous run
6123
6057
  login Authenticate via browser
6124
6058
  logout Remove saved credentials
6125
6059
  Options:
@@ -6146,7 +6080,6 @@ Start options:
6146
6080
 
6147
6081
  Stop options:
6148
6082
  vent-hq agent stop <session-id>`;
6149
- var STATUS_USAGE = `Usage: vent-hq status <run-id> [--verbose]`;
6150
6083
  async function main() {
6151
6084
  loadDotenv();
6152
6085
  const args = process.argv.slice(2);
@@ -6156,7 +6089,7 @@ async function main() {
6156
6089
  return 0;
6157
6090
  }
6158
6091
  if (command === "--version" || command === "-v") {
6159
- const pkg = await import("./package-V2MEPDXW.mjs");
6092
+ const pkg = await import("./package-GODDS4TH.mjs");
6160
6093
  console.log(`vent-hq ${pkg.default.version}`);
6161
6094
  return 0;
6162
6095
  }
@@ -6224,26 +6157,6 @@ async function main() {
6224
6157
  console.log(AGENT_USAGE);
6225
6158
  return 2;
6226
6159
  }
6227
- case "status": {
6228
- if (commandArgs.includes("--help") || commandArgs.length === 0) {
6229
- console.log(STATUS_USAGE);
6230
- return 0;
6231
- }
6232
- const { values, positionals } = parseArgs({
6233
- args: commandArgs,
6234
- options: {
6235
- verbose: { type: "boolean", short: "v", default: false }
6236
- },
6237
- allowPositionals: true,
6238
- strict: true
6239
- });
6240
- const runId = positionals[0];
6241
- if (!runId) {
6242
- console.log(STATUS_USAGE);
6243
- return 2;
6244
- }
6245
- return statusCommand({ runId, verbose: values.verbose });
6246
- }
6247
6160
  case "stop": {
6248
6161
  const runId = commandArgs[0];
6249
6162
  if (!runId || commandArgs.includes("--help")) {
@@ -4,7 +4,7 @@ import "./chunk-XYDL7GY6.mjs";
4
4
  // package.json
5
5
  var package_default = {
6
6
  name: "vent-hq",
7
- version: "0.13.0",
7
+ version: "0.13.1",
8
8
  type: "module",
9
9
  description: "Vent CLI \u2014 CI/CD for voice AI agents",
10
10
  bin: {
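The embedded skill text states that in non-TTY mode `npx vent-hq run` prints one JSON result and exits 0, 1, or 2. A short sketch of scripting against that contract, not shipped with the package: the suite path and call name are placeholders, and the fields read from the parsed result are taken from the field names the skill text mentions, so adjust them to the actual result shape.

```js
// Sketch: run one named call from a script and branch on the documented exit codes.
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';

const run = promisify(execFile);

try {
  const { stdout } = await run('npx', ['vent-hq', 'run', '-f', '.vent/suite.vapi.json', '--call', 'happy-path']);
  const result = JSON.parse(stdout); // single JSON result on stdout in non-TTY mode
  console.log(result.status, result.latency?.p95_response_time_ms ?? 'latency not computed');
} catch (err) {
  // promisified execFile rejects on a non-zero exit; err.code carries the exit code
  if (err.code === 1) console.error('pipeline-level failure, read the saved run JSON for details');
  else if (err.code === 2) console.error('harness error, check credentials and suite config');
  else throw err;
}
```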
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vent-hq",
3
- "version": "0.13.0",
3
+ "version": "0.13.1",
4
4
  "type": "module",
5
5
  "description": "Vent CLI — CI/CD for voice AI agents",
6
6
  "bin": {