@pinecall/skills 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -0
- package/build.mjs +204 -0
- package/package.json +29 -0
- package/skills/pinecall-concepts/SKILL.md +41 -0
- package/skills/pinecall-concepts/references/concepts/agents-and-channels.md +155 -0
- package/skills/pinecall-concepts/references/concepts/deployment-topologies.md +120 -0
- package/skills/pinecall-concepts/references/concepts/hot-reload.md +119 -0
- package/skills/pinecall-concepts/references/concepts/philosophy.md +100 -0
- package/skills/pinecall-concepts/references/concepts/server-vs-client-llm.md +119 -0
- package/skills/pinecall-examples/SKILL.md +59 -0
- package/skills/pinecall-examples/references/examples/browser-widget.md +206 -0
- package/skills/pinecall-examples/references/examples/chat-bot.md +184 -0
- package/skills/pinecall-examples/references/examples/headless-agent.md +121 -0
- package/skills/pinecall-examples/references/examples/index.md +183 -0
- package/skills/pinecall-examples/references/examples/multi-channel-bot.md +173 -0
- package/skills/pinecall-examples/references/examples/outbound-dispatch.md +109 -0
- package/skills/pinecall-examples/references/examples/turn-detection.md +150 -0
- package/skills/pinecall-guides/SKILL.md +68 -0
- package/skills/pinecall-guides/references/guides/call-ringing.md +149 -0
- package/skills/pinecall-guides/references/guides/conversation-history.md +377 -0
- package/skills/pinecall-guides/references/guides/dev-mode.md +130 -0
- package/skills/pinecall-guides/references/guides/events.md +677 -0
- package/skills/pinecall-guides/references/guides/human-takeover.md +184 -0
- package/skills/pinecall-guides/references/guides/inbound-voice.md +201 -0
- package/skills/pinecall-guides/references/guides/knowledge-bases.md +166 -0
- package/skills/pinecall-guides/references/guides/live-listening.md +199 -0
- package/skills/pinecall-guides/references/guides/multi-tenant.md +158 -0
- package/skills/pinecall-guides/references/guides/outbound-calls.md +279 -0
- package/skills/pinecall-guides/references/guides/sse-streaming.md +207 -0
- package/skills/pinecall-guides/references/guides/testing-agents.md +272 -0
- package/skills/pinecall-guides/references/guides/tools-and-functions.md +254 -0
- package/skills/pinecall-guides/references/guides/webrtc-browser.md +200 -0
- package/skills/pinecall-guides/references/guides/whatsapp.md +370 -0
- package/skills/pinecall-guides/references/guides/ws-streaming.md +235 -0
- package/skills/pinecall-quickstart/SKILL.md +54 -0
- package/skills/pinecall-quickstart/references/index.md +123 -0
- package/skills/pinecall-quickstart/references/quickstart.md +185 -0
- package/skills/pinecall-reference/SKILL.md +43 -0
- package/skills/pinecall-reference/references/reference/cli.md +578 -0
- package/skills/pinecall-reference/references/reference/events.md +366 -0
- package/skills/pinecall-reference/references/reference/llm-providers.md +263 -0
- package/skills/pinecall-reference/references/reference/rest-api.md +122 -0
- package/skills/pinecall-reference/references/reference/session-limits.md +119 -0
- package/skills/pinecall-reference/references/reference/stt-providers.md +174 -0
- package/skills/pinecall-reference/references/reference/tts-providers.md +149 -0
- package/skills/pinecall-sdk-api/SKILL.md +56 -0
- package/skills/pinecall-sdk-api/references/api/agent.md +328 -0
- package/skills/pinecall-sdk-api/references/api/call.md +324 -0
- package/skills/pinecall-sdk-api/references/api/pinecall.md +186 -0
- package/skills/pinecall-sdk-api/references/api/reply-stream.md +148 -0
- package/skills/pinecall-security/SKILL.md +37 -0
- package/skills/pinecall-security/references/security.md +138 -0
- package/skills/pinecall-web-chat/SKILL.md +38 -0
- package/skills/pinecall-web-chat/references/web/chat/chat-session.md +178 -0
- package/skills/pinecall-web-chat/references/web/chat/overview.md +98 -0
- package/skills/pinecall-web-components/SKILL.md +37 -0
- package/skills/pinecall-web-components/references/web/components/overview.md +128 -0
- package/skills/pinecall-web-voice/SKILL.md +40 -0
- package/skills/pinecall-web-voice/references/web/core/datachannel-protocol.md +149 -0
- package/skills/pinecall-web-voice/references/web/core/overview.md +70 -0
- package/skills/pinecall-web-voice/references/web/core/state-and-phases.md +153 -0
- package/skills/pinecall-web-voice/references/web/core/voice-session.md +279 -0
- package/skills/pinecall-web-widget/SKILL.md +41 -0
- package/skills/pinecall-web-widget/references/web/widget/overview.md +67 -0
- package/skills/pinecall-web-widget/references/web/widget/props.md +291 -0
- package/skills/pinecall-web-widget/references/web/widget/theming.md +131 -0
- package/skills/pinecall-web-widget/references/web/widget/tools-api.md +381 -0
- package/skills/pinecall-web-widget/references/web/widget/use-voice-session-hook.md +130 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "SSE Event Streaming"
|
|
3
|
+
description: "Stream agent events to your frontend in real time with Server-Sent Events."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# SSE Event Streaming
|
|
7
|
+
|
|
8
|
+
The SDK can stream every agent event (calls, speech, tools, metrics) to any HTTP client as Server-Sent Events. Build live dashboards, call monitors, or analytics pipelines with a single endpoint.
|
|
9
|
+
|
|
10
|
+
## The basics
|
|
11
|
+
|
|
12
|
+
```typescript
|
|
13
|
+
import express from "express";
|
|
14
|
+
import { Pinecall } from "@pinecall/sdk";
|
|
15
|
+
|
|
16
|
+
const app = express();
|
|
17
|
+
const pc = new Pinecall({ apiKey: process.env.PINECALL_API_KEY! });
|
|
18
|
+
|
|
19
|
+
// Stream all events from all agents
|
|
20
|
+
app.get("/events", (req, res) => pc.stream(res));
|
|
21
|
+
|
|
22
|
+
app.listen(3000);
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Open `http://localhost:3000/events` in a browser or connect with `EventSource` — you'll see every event as it happens.
|
|
26
|
+
|
|
27
|
+
## Two ways to stream
|
|
28
|
+
|
|
29
|
+
### `pc.stream()` — all agents
|
|
30
|
+
|
|
31
|
+
Streams events from every agent on this `Pinecall` instance:
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
// Express / Node.js ServerResponse
|
|
35
|
+
app.get("/events", (req, res) => pc.stream(res));
|
|
36
|
+
|
|
37
|
+
// Hono / Web API Response (no res argument)
|
|
38
|
+
app.get("/events", () => pc.stream());
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### `agent.stream()` — one agent
|
|
42
|
+
|
|
43
|
+
Streams events scoped to a single agent:
|
|
44
|
+
|
|
45
|
+
```typescript
|
|
46
|
+
const mara = pc.agent("mara", { /* ... */ });
|
|
47
|
+
|
|
48
|
+
app.get("/events/mara", (req, res) => mara.stream(res)); // Express / Node.js ServerResponse
|
|
49
|
+
app.get("/events/mara", () => mara.stream()); // Hono / Web API Response (no res argument)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Filtering by agent name
|
|
53
|
+
|
|
54
|
+
If you have multiple agents but only want events from specific ones:
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
app.get("/events", (req, res) => pc.stream(res, { agents: ["mara", "support"] }));
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Event format
|
|
61
|
+
|
|
62
|
+
Each SSE event has an `event:` type and a JSON `data:` body:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
event: call.started
|
|
66
|
+
data: {"callId":"CA_abc","from":"+34611111111","agent":"mara","direction":"inbound"}
|
|
67
|
+
|
|
68
|
+
event: user.message
|
|
69
|
+
data: {"callId":"CA_abc","text":"Hola, quiero reservar","messageId":"msg_1","agent":"mara"}
|
|
70
|
+
|
|
71
|
+
event: bot.speaking
|
|
72
|
+
data: {"callId":"CA_abc","text":"¡Claro! ¿Para cuándo?","messageId":"msg_2","agent":"mara"}
|
|
73
|
+
|
|
74
|
+
event: call.ended
|
|
75
|
+
data: {"callId":"CA_abc","reason":"hangup","duration":45,"agent":"mara"}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
A `:ping` comment is sent every 30 seconds as a keepalive to prevent proxy timeouts.
|
|
79
|
+
|
|
80
|
+
## Connecting from the browser
|
|
81
|
+
|
|
82
|
+
Use the native `EventSource` API:
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
const events = new EventSource("/events");
|
|
86
|
+
|
|
87
|
+
events.addEventListener("call.started", (e) => {
|
|
88
|
+
const data = JSON.parse(e.data);
|
|
89
|
+
console.log(`New call from ${data.from} on agent ${data.agent}`);
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
events.addEventListener("user.message", (e) => {
|
|
93
|
+
const data = JSON.parse(e.data);
|
|
94
|
+
addToTranscript("user", data.text);
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
events.addEventListener("bot.speaking", (e) => {
|
|
98
|
+
const data = JSON.parse(e.data);
|
|
99
|
+
addToTranscript("bot", data.text);
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
events.addEventListener("call.ended", (e) => {
|
|
103
|
+
const data = JSON.parse(e.data);
|
|
104
|
+
console.log(`Call ended: ${data.reason} (${data.duration}s)`);
|
|
105
|
+
});
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
`EventSource` auto-reconnects on disconnect — no manual retry logic needed.
|
|
109
|
+
|
|
110
|
+
## Building a live call monitor
|
|
111
|
+
|
|
112
|
+
A minimal React component that shows active calls:
|
|
113
|
+
|
|
114
|
+
```tsx
|
|
115
|
+
function CallMonitor() {
|
|
116
|
+
const [calls, setCalls] = useState(new Map());
|
|
117
|
+
|
|
118
|
+
useEffect(() => {
|
|
119
|
+
const es = new EventSource("/events");
|
|
120
|
+
|
|
121
|
+
es.addEventListener("call.started", (e) => {
|
|
122
|
+
const d = JSON.parse(e.data);
|
|
123
|
+
setCalls((prev) => new Map(prev).set(d.callId, {
|
|
124
|
+
from: d.from, agent: d.agent, started: Date.now(), transcript: [],
|
|
125
|
+
}));
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
es.addEventListener("user.message", (e) => {
|
|
129
|
+
const d = JSON.parse(e.data);
|
|
130
|
+
setCalls((prev) => {
|
|
131
|
+
const next = new Map(prev);
|
|
132
|
+
const call = next.get(d.callId);
|
|
133
|
+
if (call) next.set(d.callId, { ...call, transcript: [...call.transcript, { role: "user", text: d.text }] });
|
|
134
|
+
return next;
|
|
135
|
+
});
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
es.addEventListener("call.ended", (e) => {
|
|
139
|
+
const d = JSON.parse(e.data);
|
|
140
|
+
setCalls((prev) => { const next = new Map(prev); next.delete(d.callId); return next; });
|
|
141
|
+
});
|
|
142
|
+
|
|
143
|
+
return () => es.close();
|
|
144
|
+
}, []);
|
|
145
|
+
|
|
146
|
+
return (
|
|
147
|
+
<div>
|
|
148
|
+
<h2>Active Calls ({calls.size})</h2>
|
|
149
|
+
{[...calls.entries()].map(([id, call]) => (
|
|
150
|
+
<div key={id}>
|
|
151
|
+
<strong>{call.agent}</strong> — {call.from}
|
|
152
|
+
{call.transcript.map((t, i) => <p key={i}>{t.role}: {t.text}</p>)}
|
|
153
|
+
</div>
|
|
154
|
+
))}
|
|
155
|
+
</div>
|
|
156
|
+
);
|
|
157
|
+
}
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## When SSE works (and when it doesn't)
|
|
161
|
+
|
|
162
|
+
SSE streaming requires the agent and the HTTP endpoint to run in the **same process**. This is the key architectural constraint:
|
|
163
|
+
|
|
164
|
+
| Topology | SSE works? | Why |
|
|
165
|
+
|---|---|---|
|
|
166
|
+
| Agent + web server in one process | ✅ | `pc.stream()` has direct access to events |
|
|
167
|
+
| Agent in a separate process (Docker, serverless) | ❌ | Events never reach the web server process |
|
|
168
|
+
|
|
169
|
+
If you run agents in separate containers (the "separated" topology), you'd need a shared event bus (Redis Pub/Sub, NATS) between the agent and the dashboard. See [Deployment Topologies](/concepts/deployment-topologies).
|
|
170
|
+
|
|
171
|
+
## Framework examples
|
|
172
|
+
|
|
173
|
+
### Express
|
|
174
|
+
|
|
175
|
+
```typescript
|
|
176
|
+
app.get("/events", (req, res) => pc.stream(res));
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Hono
|
|
180
|
+
|
|
181
|
+
```typescript
|
|
182
|
+
app.get("/events", () => pc.stream());
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Next.js Route Handler (App Router)
|
|
186
|
+
|
|
187
|
+
```typescript
|
|
188
|
+
// app/api/events/route.ts
|
|
189
|
+
export async function GET() {
|
|
190
|
+
return pc.stream();
|
|
191
|
+
}
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Fastify
|
|
195
|
+
|
|
196
|
+
```typescript
|
|
197
|
+
fastify.get("/events", (req, reply) => {
|
|
198
|
+
pc.stream(reply.raw);
|
|
199
|
+
});
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## What's next
|
|
203
|
+
|
|
204
|
+
- [WebSocket Streaming](/guides/ws-streaming) — bidirectional alternative with session scoping
|
|
205
|
+
- [Events reference](/reference/events) — every event with payload shapes
|
|
206
|
+
- [Deployment Topologies](/concepts/deployment-topologies) — when SSE is available
|
|
207
|
+
- [Multi-tenant](/guides/multi-tenant) — scope streams per customer
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Testing Agents"
|
|
3
|
+
description: "Automated QA for voice agents using YAML specs and LLM judges."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Test your agents with `pinecall test` — define conversation workflows in YAML, and a judge LLM evaluates your agent's behavior automatically.
|
|
7
|
+
|
|
8
|
+
## Quick Start
|
|
9
|
+
|
|
10
|
+
### 1. Create a spec file
|
|
11
|
+
|
|
12
|
+
Create `agent/specs/greeting.spec.yaml` in your agent project:
|
|
13
|
+
|
|
14
|
+
```yaml
|
|
15
|
+
agent: florencia
|
|
16
|
+
description: "Verify the agent greets correctly"
|
|
17
|
+
|
|
18
|
+
workflow: |
|
|
19
|
+
1. Say "Hola"
|
|
20
|
+
2. Verify the agent responds warmly with a greeting
|
|
21
|
+
3. Verify it mentions the business name
|
|
22
|
+
4. PASS if greeting is correct, FAIL if not
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### 2. Run the test
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pinecall test agent/specs/
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
That's it. The judge LLM will converse with your agent, follow the workflow, and report pass/fail.
|
|
32
|
+
|
|
33
|
+
## How It Works
|
|
34
|
+
|
|
35
|
+
`pinecall test` uses a **judge LLM** (Claude Haiku by default) to test your agent:
|
|
36
|
+
|
|
37
|
+

|
|
38
|
+
|
|
39
|
+
The judge has two tools:
|
|
40
|
+
- **`test_passed(summary)`** — marks the workflow as passed
|
|
41
|
+
- **`test_failed(reason)`** — marks the workflow as failed
|
|
42
|
+
|
|
43
|
+
The judge's text messages are sent directly to your agent as user messages. It acts like a real customer following a script.
|
|
44
|
+
|
|
45
|
+
## Voice Mode
|
|
46
|
+
|
|
47
|
+
By default `pinecall test` talks to your agent over **text** (fast and cheap). To exercise the **real voice pipeline** — STT, turn detection, TTS, barge-in — run the spec as an actual voice call.
|
|
48
|
+
|
|
49
|
+
In voice mode the judge is a **real Pinecall agent** (server-rendered voice) **bridged** to your agent: two agents, two WebSockets, one call. Each side runs the full STT → LLM → TTS → turn-detection pipeline, and neither side knows the other is a bot — to your agent it's a normal inbound call. The CLI records both voices to a WAV, opens a live browser player, and resolves the verdict when the judge calls `test_passed()` / `test_failed()`.
|
|
50
|
+
|
|
51
|
+
### Run it
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# Force voice mode with the flag…
|
|
55
|
+
pinecall test agent/specs/greeting.spec.yaml --voice
|
|
56
|
+
|
|
57
|
+
# …or declare `mode: voice` in the spec and run it normally
|
|
58
|
+
pinecall test agent/specs/greeting.spec.yaml
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Requirements:** `PINECALL_API_KEY` + the judge LLM key (e.g. `ANTHROPIC_API_KEY`). No TTS/STT key needed — the judge's voice is rendered server-side.
|
|
62
|
+
|
|
63
|
+
### Voice spec fields
|
|
64
|
+
|
|
65
|
+
```yaml
|
|
66
|
+
agent: pines
|
|
67
|
+
description: "Pines greets and answers a question (voice)"
|
|
68
|
+
mode: voice # run as a real voice call
|
|
69
|
+
voice: elevenlabs/professional-male # judge's TTS voice (default)
|
|
70
|
+
stt: flux # judge's STT (default: deepgram-flux)
|
|
71
|
+
language: en # optional language override
|
|
72
|
+
greeting: "Hi!" # optional: judge opens the call;
|
|
73
|
+
# omit → your agent greets first, judge waits
|
|
74
|
+
timeout: 45s # voice turns are slower than text
|
|
75
|
+
|
|
76
|
+
workflow: |
|
|
77
|
+
1. Greet the agent and ask what it can help with.
|
|
78
|
+
2. Verify it responds coherently and offers to help.
|
|
79
|
+
3. Ask "What is Pinecall?" and verify the answer is on-topic.
|
|
80
|
+
4. PASS if the conversation was coherent and helpful.
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Voice CLI flags
|
|
84
|
+
|
|
85
|
+
| Flag | Description |
|
|
86
|
+
|------|-------------|
|
|
87
|
+
| `--voice <p/v>` | Judge's TTS voice (e.g. `elevenlabs/professional-male`). Passing it also forces voice mode. |
|
|
88
|
+
| `--stt <prov>` | Judge's STT (e.g. `flux`). Default: `deepgram-flux`. |
|
|
89
|
+
| `--record <file>` | WAV output path. Default: `<spec>.wav` next to the spec. |
|
|
90
|
+
| `--no-listen` | Don't auto-open the live browser player. |
|
|
91
|
+
| `--lang <code>` | Language override (e.g. `es`). |
|
|
92
|
+
|
|
93
|
+
The judge speaks one short sentence per turn (it behaves like a real caller), the mixed call is recorded to a WAV next to the spec, and the transcript shows both sides per utterance.
|
|
94
|
+
|
|
95
|
+
## Writing Specs
|
|
96
|
+
|
|
97
|
+
### Spec format
|
|
98
|
+
|
|
99
|
+
Specs are YAML files ending in `.spec.yaml` or `.spec.yml`.
|
|
100
|
+
|
|
101
|
+
```yaml
|
|
102
|
+
# Required fields
|
|
103
|
+
agent: florencia # Agent name (as registered with pc.agent())
|
|
104
|
+
workflow: | # Natural language workflow for the judge
|
|
105
|
+
1. Do something
|
|
106
|
+
2. Verify something
|
|
107
|
+
3. PASS or FAIL
|
|
108
|
+
|
|
109
|
+
# Optional fields
|
|
110
|
+
description: "Human-readable title" # Shown in test output
|
|
111
|
+
timeout: 45s # Per-turn timeout (default: 30s)
|
|
112
|
+
|
|
113
|
+
# Optional: override judge model
|
|
114
|
+
judge:
|
|
115
|
+
provider: anthropic # anthropic | openai | google | deepseek | ollama
|
|
116
|
+
model: claude-haiku-4-5 # Model name
|
|
117
|
+
maxTurns: 10 # Max conversation turns (default: 20)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Workflow tips
|
|
121
|
+
|
|
122
|
+
The `workflow` field is natural language — the judge LLM interprets it. Write it like instructions for a QA tester:
|
|
123
|
+
|
|
124
|
+
```yaml
|
|
125
|
+
# ✅ Good — clear, actionable steps
|
|
126
|
+
workflow: |
|
|
127
|
+
1. Ask what services are available
|
|
128
|
+
2. Verify the agent lists at least 3 services with prices
|
|
129
|
+
3. Ask to book one of them
|
|
130
|
+
4. Verify the agent calls the booking tool
|
|
131
|
+
5. PASS if booking flow works, FAIL if anything breaks
|
|
132
|
+
|
|
133
|
+
# ❌ Bad — too vague
|
|
134
|
+
workflow: |
|
|
135
|
+
Test if the agent works correctly
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Best practices:**
|
|
139
|
+
- Number your steps for clarity
|
|
140
|
+
- Be specific about what "correct" means (dates, tool calls, content)
|
|
141
|
+
- Always end with explicit PASS/FAIL criteria
|
|
142
|
+
- Write in the same language your agent speaks
|
|
143
|
+
|
|
144
|
+
### Verifying tool calls
|
|
145
|
+
|
|
146
|
+
The judge sees your agent's tool calls, so you can verify them:
|
|
147
|
+
|
|
148
|
+
```yaml
|
|
149
|
+
workflow: |
|
|
150
|
+
1. Ask to book a haircut for tomorrow
|
|
151
|
+
2. Verify the agent calls checkAvailability
|
|
152
|
+
3. Verify the date argument is tomorrow's date in YYYY-MM-DD format
|
|
153
|
+
4. PASS if the tool was called with the correct date
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Verifying behavior
|
|
157
|
+
|
|
158
|
+
Test what your agent says (or doesn't say):
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
workflow: |
|
|
162
|
+
1. Ask to book on a Sunday
|
|
163
|
+
2. Verify the agent says the business is CLOSED on Sundays
|
|
164
|
+
3. Verify the agent does NOT call checkAvailability
|
|
165
|
+
4. Verify the agent suggests an alternative day
|
|
166
|
+
5. PASS if all conditions met
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Judge Providers
|
|
170
|
+
|
|
171
|
+
The judge is the LLM that evaluates your agent. Choose based on cost and reliability:
|
|
172
|
+
|
|
173
|
+
| Provider | Model | Cost (per 1M tokens) | Notes |
|
|
174
|
+
|----------|-------|---------------------|-------|
|
|
175
|
+
| `anthropic` | `claude-haiku-4-5-20251001` | $0.80 in / $4.00 out | Default. Most reliable. |
|
|
176
|
+
| `openai` | `gpt-4.1-nano` | $0.10 in / $0.40 out | **Recommended.** 10x cheaper than Haiku. |
|
|
177
|
+
| `google` | `gemini-2.5-flash` | — | Fast, low cost. |
|
|
178
|
+
| `deepseek` | `deepseek-v4-flash` | $0.14 in / $0.28 out | Cheapest cloud option. |
|
|
179
|
+
| `ollama` | `gemma3:4b` | Free | Local. Requires Ollama running. |
|
|
180
|
+
|
|
181
|
+
Set the judge in the spec file or override with CLI:
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# Override the judge for all specs (format: provider/model)
|
|
185
|
+
pinecall test agent/specs/ --judge anthropic/claude-haiku-4-5
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Environment variables
|
|
189
|
+
|
|
190
|
+
Each provider needs its API key:
|
|
191
|
+
|
|
192
|
+
| Variable | Provider |
|
|
193
|
+
|----------|----------|
|
|
194
|
+
| `ANTHROPIC_API_KEY` | Anthropic (default) |
|
|
195
|
+
| `OPENAI_API_KEY` | OpenAI |
|
|
196
|
+
| `GOOGLE_API_KEY` | Google |
|
|
197
|
+
| `DEEPSEEK_API_KEY` | DeepSeek |
|
|
198
|
+
| `OLLAMA_HOST` | Ollama (default: `http://localhost:11434`) |
|
|
199
|
+
|
|
200
|
+
## CLI Reference
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
# Run all specs in a directory
|
|
204
|
+
pinecall test agent/specs/
|
|
205
|
+
|
|
206
|
+
# Run a single spec
|
|
207
|
+
pinecall test agent/specs/date-handling.spec.yaml
|
|
208
|
+
|
|
209
|
+
# Override judge model
|
|
210
|
+
pinecall test agent/specs/ --judge anthropic/claude-haiku-4-5
|
|
211
|
+
|
|
212
|
+
# Override agent name
|
|
213
|
+
pinecall test agent/specs/ --agent dev-berna-florencia
|
|
214
|
+
|
|
215
|
+
# Filter specs by name
|
|
216
|
+
pinecall test agent/specs/ --grep "date"
|
|
217
|
+
|
|
218
|
+
# JSON output (for CI pipelines)
|
|
219
|
+
pinecall test agent/specs/ --json
|
|
220
|
+
|
|
221
|
+
# List specs without running
|
|
222
|
+
pinecall test agent/specs/ --list
|
|
223
|
+
|
|
224
|
+
# Verbose mode (full agent responses)
|
|
225
|
+
pinecall test agent/specs/ --verbose
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## CI Integration
|
|
229
|
+
|
|
230
|
+
`pinecall test` exits with code `1` when any spec fails, and supports `--json` for machine-readable output:
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
# In your CI pipeline
|
|
234
|
+
export PINECALL_API_KEY="pk_..."
|
|
235
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
236
|
+
|
|
237
|
+
pinecall test agent/specs/ --judge anthropic/claude-haiku-4-5 --json
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
JSON output structure:
|
|
241
|
+
|
|
242
|
+
```json
|
|
243
|
+
{
|
|
244
|
+
"passed": 2,
|
|
245
|
+
"failed": 0,
|
|
246
|
+
"results": [
|
|
247
|
+
{
|
|
248
|
+
"file": "agent/specs/date-handling.spec.yaml",
|
|
249
|
+
"agent": "florencia",
|
|
250
|
+
"passed": true,
|
|
251
|
+
"summary": "All dates verified correctly",
|
|
252
|
+
"turns": [...],
|
|
253
|
+
"durationMs": 4300
|
|
254
|
+
}
|
|
255
|
+
]
|
|
256
|
+
}
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Project Structure
|
|
260
|
+
|
|
261
|
+
Recommended layout for your agent project:
|
|
262
|
+
|
|
263
|
+
```
|
|
264
|
+
my-agent/
|
|
265
|
+
├── agent/
|
|
266
|
+
│ ├── index.js # Agent code
|
|
267
|
+
│ └── specs/ # Test specs
|
|
268
|
+
│ ├── greeting.spec.yaml
|
|
269
|
+
│ ├── booking.spec.yaml
|
|
270
|
+
│ └── edge-cases.spec.yaml
|
|
271
|
+
└── package.json
|
|
272
|
+
```
|