incantx 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -29
- package/dist/cli.js +281 -22
- package/dist/cli.js.map +6 -6
- package/dist/index.js +281 -22
- package/dist/index.js.map +6 -6
- package/dist/src/fixture/types.d.ts +65 -4
- package/dist/src/runner/expectations.d.ts +7 -2
- package/dist/src/runner/runFixtureFile.d.ts +2 -1
- package/dist/src/runner/subprocessAgent.d.ts +6 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -50,7 +50,7 @@ incantx path/to/fixtures --judge off
|
|
|
50
50
|
|
|
51
51
|
- A test framework for “agents” that behave like a **single Chat Completions call**: input is `{messages, tools}` and output is the **next assistant message** (which may include `tool_calls`).
|
|
52
52
|
- A fixture format that supports:
|
|
53
|
-
- Starting tests **mid-conversation** via full
|
|
53
|
+
- Starting tests **mid-conversation** via a full transcript.
|
|
54
54
|
- Asserting on the **next assistant message**, including **tool call expectations**.
|
|
55
55
|
- (Planned) executing tools and continuing until the agent returns a final message.
|
|
56
56
|
- LLM-based assertions (“judge”) for fuzzy/semantic checks (implemented via OpenAI in `auto` mode; skipped if no `OPENAI_API_KEY`).
|
|
@@ -91,13 +91,19 @@ Fixture files are YAML and contain:
|
|
|
91
91
|
|
|
92
92
|
- a file-level `agent` config (optional), and
|
|
93
93
|
- a `fixtures[]` list, each with:
|
|
94
|
-
-
|
|
95
|
-
|
|
96
|
-
|
|
94
|
+
- a `transcript[]` list of chat turns, ending with `next_assistant` (required)
|
|
95
|
+
|
|
96
|
+
Transcript entries are one of:
|
|
97
|
+
|
|
98
|
+
- `system: "..."`
|
|
99
|
+
- `user: "..."`
|
|
100
|
+
- `assistant: "..."` or `assistant: { content: "...", tool_calls: [...] }`
|
|
101
|
+
- `tool: { name: "...", tool_call_id: "...", json: {...} }` (or `content: "..."`)
|
|
102
|
+
- `next_assistant: { ...expectations... }` (required, must be last)
|
|
97
103
|
|
|
98
104
|
### Example fixture file
|
|
99
105
|
|
|
100
|
-
See `tests/fixtures/weather.yaml` for a complete working example. The key idea is that
|
|
106
|
+
See `tests/fixtures/weather.yaml` for a complete working example. The key idea is that fixtures read like a chat transcript, and `next_assistant` describes what you expect the agent to return for the next turn.
|
|
101
107
|
|
|
102
108
|
```yaml
|
|
103
109
|
agent:
|
|
@@ -106,42 +112,59 @@ agent:
|
|
|
106
112
|
|
|
107
113
|
fixtures:
|
|
108
114
|
- id: weather-requests-tool
|
|
109
|
-
|
|
110
|
-
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
115
|
+
transcript:
|
|
116
|
+
- system: You are a helpful assistant.
|
|
117
|
+
- user: What's the weather in Dublin?
|
|
118
|
+
- next_assistant:
|
|
119
|
+
content_match: exact
|
|
120
|
+
content: ""
|
|
121
|
+
tool_calls_match: contains
|
|
122
|
+
tool_calls:
|
|
123
|
+
- get_weather: { location: Dublin, unit: c }
|
|
124
|
+
tool_results_match: contains
|
|
125
|
+
tool_results:
|
|
126
|
+
- get_weather: { condition: Rain }
|
|
120
127
|
```
|
|
121
128
|
|
|
122
129
|
## Expectations
|
|
123
130
|
|
|
124
|
-
All expectations apply to the **next assistant message** produced after the runner
|
|
125
|
-
|
|
126
|
-
1. takes `history` (if any),
|
|
127
|
-
2. appends `{ role: "user", content: input }`,
|
|
128
|
-
3. calls the agent once.
|
|
131
|
+
All expectations apply to the **next assistant message** produced after the runner sends the transcript’s messages (everything before `next_assistant`) to the agent once.
|
|
129
132
|
|
|
130
133
|
### Expecting tool calls
|
|
131
134
|
|
|
132
|
-
Use `
|
|
135
|
+
Use `next_assistant.tool_calls` to assert that the next assistant message includes tool calls.
|
|
133
136
|
|
|
134
|
-
- `
|
|
135
|
-
- `
|
|
137
|
+
- `next_assistant.tool_calls_match: contains` (default): each expected tool call must appear somewhere in the returned `tool_calls`; extra tool calls are allowed; order is ignored.
|
|
138
|
+
- `next_assistant.tool_calls_match: exact`: the returned `tool_calls` must match exactly (same length/order and matching entries).
|
|
136
139
|
|
|
137
140
|
Tool call matching details:
|
|
138
141
|
|
|
139
142
|
- `name` matches `tool_calls[].function.name`.
|
|
140
143
|
- `arguments` is a **subset match** against the parsed JSON from `tool_calls[].function.arguments` (which is a JSON string in OpenAI format).
|
|
141
144
|
|
|
145
|
+
Fixture sugar:
|
|
146
|
+
|
|
147
|
+
- `tool_calls` entries can be either `{ name: get_weather, args: {...}, id?: ... }` or `{ get_weather: {...args} }`.
|
|
148
|
+
- `tool_results` entries can be either `{ name: ..., tool_call_id?: ..., content?: ..., content_json?: ... }` or `{ get_weather: {...jsonSubset} }`.
|
|
149
|
+
|
|
150
|
+
### Expecting tool results (agent-provided)
|
|
151
|
+
|
|
152
|
+
If your agent returns `tool_messages` alongside `message` (same JSONL response), use `next_assistant.tool_results` to assert on those `role: "tool"` messages.
|
|
153
|
+
|
|
154
|
+
Note: `tool_messages` are only used for expectations today; incantx does not yet append them into the next call’s `messages` automatically (that’s part of the planned tool-execution loop).
|
|
155
|
+
|
|
156
|
+
Tool result matching details:
|
|
157
|
+
|
|
158
|
+
- Match by `tool_call_id` and/or `name`.
|
|
159
|
+
- `content_json` is a subset match against parsed JSON from `tool_messages[].content`.
|
|
160
|
+
|
|
161
|
+
### Expecting assistant content
|
|
162
|
+
|
|
163
|
+
Use `next_assistant.content` with `next_assistant.content_match: contains|exact` for deterministic checks.
|
|
164
|
+
|
|
142
165
|
### Expecting assistant content (LLM-judged)
|
|
143
166
|
|
|
144
|
-
Use `
|
|
167
|
+
Use `next_assistant.llm` to express the intended outcome in natural language.
|
|
145
168
|
|
|
146
169
|
The CLI can grade this using an LLM judge. By default (`--judge auto`), it will only run if `OPENAI_API_KEY` is set; otherwise these checks are marked `SKIP`.
|
|
147
170
|
|
|
@@ -166,11 +189,11 @@ For local, language-agnostic agents, the runner will spawn a subprocess and comm
|
|
|
166
189
|
}
|
|
167
190
|
```
|
|
168
191
|
|
|
169
|
-
#### How
|
|
192
|
+
#### How messages are passed
|
|
170
193
|
|
|
171
|
-
|
|
194
|
+
The runner sends the full conversation so far in `messages` **on every call** (Chat Completions style). This is what makes “start tests mid-conversation” possible: fixtures provide a transcript that becomes the `messages` array.
|
|
172
195
|
|
|
173
|
-
When tools are involved,
|
|
196
|
+
When tools are involved, `messages` typically includes:
|
|
174
197
|
|
|
175
198
|
1. an assistant message containing `tool_calls`, then
|
|
176
199
|
2. one or more `role: "tool"` messages containing tool results (each with `tool_call_id`), then
|
|
@@ -209,7 +232,8 @@ Example (abridged):
|
|
|
209
232
|
|
|
210
233
|
```json
|
|
211
234
|
{
|
|
212
|
-
"message": { "role": "assistant", "content": "hello", "tool_calls": [] }
|
|
235
|
+
"message": { "role": "assistant", "content": "hello", "tool_calls": [] },
|
|
236
|
+
"tool_messages": []
|
|
213
237
|
}
|
|
214
238
|
```
|
|
215
239
|
|
package/dist/cli.js
CHANGED
|
@@ -44,24 +44,207 @@ function loadFixtureFile(yamlText, env = process.env) {
|
|
|
44
44
|
if (!Array.isArray(file.fixtures))
|
|
45
45
|
throw new Error("Fixture file must contain `fixtures: [...]`.");
|
|
46
46
|
const normalized = {
|
|
47
|
-
fixtures:
|
|
47
|
+
fixtures: []
|
|
48
48
|
};
|
|
49
49
|
if (file.agent)
|
|
50
50
|
normalized.agent = normalizeAgentSpec(file.agent, env);
|
|
51
|
-
|
|
51
|
+
/**
 * Normalizes one fixture tool-call entry into `{ id?, name, args }`.
 *
 * Two spellings are accepted:
 *   - explicit: `{ name, args?, id? }` (selected whenever a `name` key exists)
 *   - sugar:    `{ toolName: { ...args } }` (exactly one key, object value)
 *
 * @param {unknown} entry - Raw entry taken from the fixture.
 * @param {string} where - Path prefix used in error messages.
 * @returns {{ id?: string, name: string, args?: unknown }} Normalized call.
 * @throws {Error} If the entry matches neither spelling.
 */
function normalizeToolCallSource(entry, where) {
  if (entry === null || typeof entry !== "object") {
    throw new Error(`${where} must be an object.`);
  }

  // Explicit form: the presence of `name` decides, whatever else is there.
  if ("name" in entry) {
    const { id, name, args } = entry;
    return {
      id: id === undefined ? undefined : String(id),
      name: String(name),
      args
    };
  }

  // Sugar form: the single key is the tool name, its value the arguments.
  const [toolName, ...extraKeys] = Object.keys(entry);
  if (toolName === undefined || extraKeys.length > 0) {
    throw new Error(`${where} must be { name: ..., args: ... } or { toolName: { ...args } }.`);
  }
  const toolArgs = entry[toolName];
  if (toolArgs === null || typeof toolArgs !== "object") {
    throw new Error(`${where}.${toolName} must be an object of args.`);
  }
  return { name: toolName, args: toolArgs };
}
|
|
71
|
+
/**
 * Converts the `tool_calls` list of a transcript assistant entry into
 * message-level tool calls of the shape
 * `{ id, type: "function", function: { name, arguments } }`, where
 * `arguments` is the JSON-serialized argument object.
 *
 * @param {unknown[] | undefined | null} toolCalls - Raw fixture entries.
 * @param {(preferred?: string) => string} callId - Resolves the id to use for
 *   a call, given the id written in the fixture (if any).
 * @returns {object[] | undefined} The normalized calls, or `undefined` when
 *   the fixture supplied none.
 * @throws {Error} If `toolCalls` is present but is not an array, or an entry
 *   is malformed.
 */
function normalizeToolCallsForMessage(toolCalls, callId) {
  if (!toolCalls)
    return;
  // A mapping (or any other non-array) has no usable `length`; without this
  // check it would be silently treated as "no tool calls".
  if (!Array.isArray(toolCalls))
    throw new Error("tool_calls must be an array.");
  if (toolCalls.length === 0)
    return;
  const out = [];
  for (const [i, raw] of toolCalls.entries()) {
    const tc = normalizeToolCallSource(raw, `tool_calls[${i}]`);
    out.push({
      id: callId(tc.id),
      type: "function",
      function: {
        name: tc.name,
        // Missing args are serialized as an empty object.
        arguments: JSON.stringify(tc.args ?? {})
      }
    });
  }
  return out;
}
|
|
88
|
+
/**
 * Builds a `role: "tool"` message from a transcript `tool:` entry.
 *
 * The message content comes from `json` (serialized with `JSON.stringify`)
 * when it is defined, otherwise from `content` (coerced with `String`).
 *
 * @param {unknown} entry - Raw `tool` value from the transcript.
 * @param {string} where - Path prefix used in error messages.
 * @param {() => string} inferToolCallId - Called when the entry has no usable
 *   `tool_call_id` (absent or empty) to obtain one.
 * @returns {{ role: "tool", tool_call_id: string, name: string, content: string }}
 * @throws {Error} If the entry is not an object or has neither `json` nor
 *   `content`.
 */
function normalizeTranscriptTool(entry, where, inferToolCallId) {
  if (entry === null || typeof entry !== "object") {
    throw new Error(`${where} must be an object.`);
  }

  const name = String(entry.name);
  const explicitId = entry.tool_call_id === undefined ? "" : String(entry.tool_call_id);
  // Inference runs before the payload check, so its error takes precedence.
  const tool_call_id = explicitId || inferToolCallId();

  let content;
  if (entry.json !== undefined) {
    content = JSON.stringify(entry.json);
  } else if (entry.content !== undefined) {
    content = String(entry.content);
  } else {
    throw new Error(`${where} must include either 'json' or 'content'.`);
  }
  return { role: "tool", tool_call_id, name, content };
}
|
|
103
|
+
/**
 * Normalizes a transcript's `next_assistant` block into the expectation
 * object consumed by the runner.
 *
 * Output fields (each present only when relevant):
 *   - `assistant`: `{ content?, content_match?, llm? }`, attached only when
 *     `content` or `llm` was given.
 *   - `tool_calls` / `tool_calls_match`: match mode defaults to "contains"
 *     when `tool_calls` is given.
 *   - `tool_results` / `tool_results_match`: match mode defaults to
 *     "contains" when `tool_results` is given.
 *
 * @param {unknown} expect - Raw `next_assistant` value from the fixture.
 * @param {string} where - Path prefix used in error messages.
 * @returns {object} The normalized expectation.
 * @throws {Error} If `expect` is not an object, if `tool_calls` or
 *   `tool_results` is present but not an array, or if an entry is malformed.
 */
function normalizeNextAssistantExpect(expect, where) {
  if (!expect || typeof expect !== "object")
    throw new Error(`${where} must be an object.`);
  const tool_calls = expect.tool_calls;
  const tool_results = expect.tool_results;
  // Report a non-list with the fixture path instead of letting `.map` fail
  // with an opaque TypeError further down.
  if (tool_calls && !Array.isArray(tool_calls))
    throw new Error(`${where}.tool_calls must be an array.`);
  if (tool_results && !Array.isArray(tool_results))
    throw new Error(`${where}.tool_results must be an array.`);
  const out = {};
  const assistant = {};
  if (expect.content !== undefined)
    assistant.content = String(expect.content);
  if (expect.content_match !== undefined)
    assistant.content_match = expect.content_match;
  if (expect.llm !== undefined)
    assistant.llm = String(expect.llm);
  // `content_match` on its own is not an expectation, so it is dropped.
  if (assistant.content !== undefined || assistant.llm !== undefined)
    out.assistant = assistant;
  if (expect.tool_calls_match !== undefined)
    out.tool_calls_match = expect.tool_calls_match;
  if (tool_calls) {
    out.tool_calls = tool_calls.map((tc, i) => {
      const normalizedCall = normalizeToolCallSource(tc, `${where}.tool_calls[${i}]`);
      const entry = { name: normalizedCall.name };
      if (normalizedCall.args !== undefined)
        entry.arguments = normalizedCall.args;
      return entry;
    });
    if (out.tool_calls_match === undefined)
      out.tool_calls_match = "contains";
  }
  if (expect.tool_results_match !== undefined)
    out.tool_results_match = expect.tool_results_match;
  if (tool_results) {
    out.tool_results = tool_results.map((tr, i) => {
      const whereItem = `${where}.tool_results[${i}]`;
      if (!tr || typeof tr !== "object")
        throw new Error(`${whereItem} must be an object.`);
      // Explicit form: any of the known keys selects it.
      if ("name" in tr || "tool_call_id" in tr || "content" in tr || "content_json" in tr) {
        const entry = {};
        if (tr.name !== undefined)
          entry.name = String(tr.name);
        if (tr.tool_call_id !== undefined)
          entry.tool_call_id = String(tr.tool_call_id);
        if (tr.content !== undefined)
          entry.content = String(tr.content);
        if (tr.content_match !== undefined)
          entry.content_match = tr.content_match;
        if (tr.content_json !== undefined)
          entry.content_json = tr.content_json;
        return entry;
      }
      // Sugar form: `{ toolName: { ...jsonSubset } }`.
      const keys = Object.keys(tr);
      if (keys.length !== 1)
        throw new Error(`${whereItem} must be { name: ..., ... } or { toolName: { ...jsonSubset } }.`);
      const name = keys[0];
      const content_json = tr[name];
      if (!content_json || typeof content_json !== "object")
        throw new Error(`${whereItem}.${name} must be an object.`);
      return { name, content_json };
    });
    if (out.tool_results_match === undefined)
      out.tool_results_match = "contains";
  }
  return out;
}
|
|
167
|
+
/**
 * Turns a fixture `transcript` into the `messages` array sent to the agent
 * plus the expectation taken from the trailing `next_assistant` entry.
 *
 * Every entry before the last must be one of `system`, `user`, `assistant`
 * or `tool`. Tool calls without an id receive a generated `call_N` id; a
 * `tool` entry without `tool_call_id` inherits the id of the only tool call
 * seen so far in the transcript.
 *
 * @param {unknown} entries - Raw `transcript` value from the fixture.
 * @param {string} where - Path prefix used in error messages.
 * @returns {{ messages: object[], expect: object }}
 * @throws {Error} If the transcript is not a non-empty array ending in
 *   `{ next_assistant: ... }`, or an entry is malformed.
 */
function normalizeTranscript(entries, where) {
  if (!Array.isArray(entries))
    throw new Error(`${where} must be an array.`);
  if (entries.length === 0)
    throw new Error(`${where} must not be empty.`);
  const last = entries[entries.length - 1];
  if (!last || typeof last !== "object" || !("next_assistant" in last)) {
    throw new Error(`${where} must end with { next_assistant: ... }.`);
  }
  let callCounter = 0;
  const seenToolCallIds = new Set();
  const nextCallId = (preferred) => {
    // An id written in the fixture is always honoured, even when repeated.
    if (preferred !== undefined && preferred !== null) {
      seenToolCallIds.add(preferred);
      return preferred;
    }
    // Skip generated ids the fixture already used explicitly (e.g. an
    // explicit "call_1" followed by a call with no id) instead of failing.
    let id;
    do {
      id = `call_${++callCounter}`;
    } while (seenToolCallIds.has(id));
    seenToolCallIds.add(id);
    return id;
  };
  const inferSingleToolCallId = () => {
    // Inference is only unambiguous when exactly one tool call id is known.
    if (seenToolCallIds.size !== 1) {
      throw new Error(`${where}: tool entry is missing tool_call_id and it cannot be inferred.`);
    }
    return [...seenToolCallIds][0];
  };
  const messages = [];
  // The final entry is the expectation, so it is excluded from the messages.
  for (let i = 0; i < entries.length - 1; i++) {
    const entry = entries[i];
    const whereEntry = `${where}[${i}]`;
    if (!entry || typeof entry !== "object")
      throw new Error(`${whereEntry} must be an object.`);
    if ("system" in entry) {
      messages.push({ role: "system", content: String(entry.system) });
      continue;
    }
    if ("user" in entry) {
      messages.push({ role: "user", content: String(entry.user) });
      continue;
    }
    if ("assistant" in entry) {
      const a = entry.assistant;
      if (typeof a === "string") {
        messages.push({ role: "assistant", content: a });
        continue;
      }
      if (!a || typeof a !== "object")
        throw new Error(`${whereEntry}.assistant must be a string or object.`);
      const tool_calls = normalizeToolCallsForMessage(a.tool_calls, nextCallId);
      messages.push({
        role: "assistant",
        content: a.content !== undefined ? String(a.content) : "",
        tool_calls
      });
      continue;
    }
    if ("tool" in entry) {
      messages.push(normalizeTranscriptTool(entry.tool, `${whereEntry}.tool`, inferSingleToolCallId));
      continue;
    }
    throw new Error(`${whereEntry} must be one of: system, user, assistant, tool.`);
  }
  const expect = normalizeNextAssistantExpect(last.next_assistant, `${where}[${entries.length - 1}].next_assistant`);
  return { messages, expect };
}
|
|
235
|
+
normalized.fixtures = file.fixtures.map((fixture, index) => {
|
|
52
236
|
if (!fixture || typeof fixture !== "object")
|
|
53
237
|
throw new Error(`fixtures[${index}] must be an object.`);
|
|
54
238
|
if (!("id" in fixture))
|
|
55
239
|
throw new Error(`fixtures[${index}].id is required.`);
|
|
56
|
-
if (!("
|
|
57
|
-
throw new Error(`fixtures[${index}].
|
|
58
|
-
const
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
out.expect.tool_calls_match = "contains";
|
|
240
|
+
if (!("transcript" in fixture))
|
|
241
|
+
throw new Error(`fixtures[${index}].transcript is required.`);
|
|
242
|
+
const id = String(fixture.id);
|
|
243
|
+
const transcript = fixture.transcript;
|
|
244
|
+
const { messages, expect } = normalizeTranscript(transcript, `fixtures[${index}].transcript`);
|
|
245
|
+
const out = { id, messages, expect };
|
|
246
|
+
if (fixture.agent)
|
|
247
|
+
out.agent = normalizeAgentSpec(fixture.agent, env);
|
|
65
248
|
return out;
|
|
66
249
|
});
|
|
67
250
|
return normalized;
|
|
@@ -202,9 +385,9 @@ ${stderrText}` : ""));
|
|
|
202
385
|
if (isError(payload))
|
|
203
386
|
throw new Error(payload.error.message);
|
|
204
387
|
if (!isSuccess(payload)) {
|
|
205
|
-
throw new Error(`Agent response must be { "message": { ... } } or { "error": { "message": ... } }.`);
|
|
388
|
+
throw new Error(`Agent response must be { "message": { ... }, "tool_messages"?: [...] } or { "error": { "message": ... } }.`);
|
|
206
389
|
}
|
|
207
|
-
return payload
|
|
390
|
+
return payload;
|
|
208
391
|
} finally {
|
|
209
392
|
proc.kill();
|
|
210
393
|
await Promise.allSettled([proc.exited, stderrPromise]);
|
|
@@ -299,16 +482,92 @@ function checkToolCalls(expect, message) {
|
|
|
299
482
|
}
|
|
300
483
|
return { status: "pass" };
|
|
301
484
|
}
|
|
302
|
-
|
|
485
|
+
/**
 * Tests whether one agent-provided tool message satisfies one expected tool
 * result. Only the fields present on `expected` are checked.
 *
 * @param {{ tool_call_id?: string, name?: string, content?: string,
 *           content_match?: string, content_json?: unknown }} expected
 * @param {{ tool_call_id?: string, name?: string, content?: unknown }} actual
 * @returns {boolean} `true` when every specified field matches.
 */
function matchToolResult(expected, actual) {
  if (expected.tool_call_id !== undefined && actual.tool_call_id !== expected.tool_call_id)
    return false;
  if (expected.name !== undefined && actual.name !== expected.name)
    return false;
  if (expected.content !== undefined) {
    // Text comparison defaults to exact equality.
    const mode = expected.content_match ?? "exact";
    if (mode === "exact" && actual.content !== expected.content)
      return false;
    if (mode === "contains") {
      // The tool message comes from the agent, so its content may be null or
      // missing; that is a non-match, not a crash.
      if (typeof actual.content !== "string" || !actual.content.includes(expected.content))
        return false;
    }
  }
  if (expected.content_json !== undefined) {
    const actualJson = parseJsonOrUndefined(actual.content);
    if (actualJson === undefined)
      return false;
    if (!deepPartialMatch(expected.content_json, actualJson))
      return false;
  }
  return true;
}
|
|
506
|
+
/**
 * Checks the agent's tool messages against `expect.tool_results`.
 *
 * - "contains" (the default): every expected entry must match at least one
 *   tool message; each entry needs a `name` or `tool_call_id`.
 * - any other mode: the lists must have the same length and match pairwise,
 *   index by index.
 *
 * @param {{ tool_results?: object[], tool_results_match?: string }} expect
 * @param {object[]} toolMessages - Tool messages returned by the agent.
 * @returns {{ status: "pass" } | { status: "fail", reason: string }}
 */
function checkToolResults(expect, toolMessages) {
  const wanted = expect.tool_results ?? [];
  if (wanted.length === 0) {
    return { status: "pass" };
  }

  const matchMode = expect.tool_results_match ?? "contains";
  if (matchMode !== "contains") {
    // Positional comparison.
    if (wanted.length !== toolMessages.length) {
      return {
        status: "fail",
        reason: `Expected exactly ${wanted.length} tool result(s), got ${toolMessages.length}.`
      };
    }
    const firstBad = wanted.findIndex((expected, i) => {
      const actual = toolMessages[i];
      return !expected || !actual || !matchToolResult(expected, actual);
    });
    return firstBad === -1
      ? { status: "pass" }
      : { status: "fail", reason: `Tool result mismatch at index ${firstBad}.` };
  }

  // Order-insensitive comparison; the first problematic entry wins.
  const failureFor = (expected) => {
    if (!expected.name && !expected.tool_call_id) {
      return { status: "fail", reason: "tool_results entries must include at least 'name' or 'tool_call_id'." };
    }
    if (toolMessages.some((actual) => matchToolResult(expected, actual))) {
      return undefined;
    }
    const suffix = expected.name ? `: ${expected.name}` : "";
    return { status: "fail", reason: `Expected tool result not found${suffix}.` };
  };
  for (const expected of wanted) {
    const failure = failureFor(expected);
    if (failure) {
      return failure;
    }
  }
  return { status: "pass" };
}
|
|
541
|
+
/**
 * Deterministic check of the assistant message text against
 * `expect.assistant.content`.
 *
 * The match mode comes from `expect.assistant.content_match` and defaults to
 * "contains"; "exact" requires strict equality. A `null` or missing
 * `message.content` is treated as an empty string, so a tool-call-only
 * message can satisfy `content: ""` and never crashes the substring check.
 *
 * @param {{ assistant?: { content?: string, content_match?: string } }} expect
 * @param {{ content?: unknown }} message - The assistant message returned by
 *   the agent.
 * @returns {{ status: "pass" } | { status: "fail", reason: string }}
 */
function checkAssistantContent(expect, message) {
  const expected = expect.assistant?.content;
  if (expected === undefined)
    return { status: "pass" };
  const actual = message.content ?? "";
  const mode = expect.assistant?.content_match ?? "contains";
  if (mode === "exact") {
    if (actual !== expected)
      return { status: "fail", reason: "Assistant content did not match exactly." };
    return { status: "pass" };
  }
  // Non-string content cannot contain the expected text.
  if (typeof actual !== "string" || !actual.includes(expected))
    return { status: "fail", reason: "Assistant content did not contain expected text." };
  return { status: "pass" };
}
|
|
555
|
+
async function evaluateExpectations(expect, turn, judge) {
|
|
303
556
|
if (!expect)
|
|
304
557
|
return { status: "pass" };
|
|
305
|
-
const toolRes = checkToolCalls(expect, message);
|
|
558
|
+
const toolRes = checkToolCalls(expect, turn.message);
|
|
306
559
|
if (toolRes.status !== "pass")
|
|
307
560
|
return toolRes;
|
|
561
|
+
const toolResultsRes = checkToolResults(expect, turn.tool_messages ?? []);
|
|
562
|
+
if (toolResultsRes.status !== "pass")
|
|
563
|
+
return toolResultsRes;
|
|
564
|
+
const contentRes = checkAssistantContent(expect, turn.message);
|
|
565
|
+
if (contentRes.status !== "pass")
|
|
566
|
+
return contentRes;
|
|
308
567
|
if (expect.assistant?.llm) {
|
|
309
568
|
if (!judge)
|
|
310
569
|
return { status: "skip", reason: "LLM judge not configured." };
|
|
311
|
-
return await judge({ expectation: expect.assistant.llm, message });
|
|
570
|
+
return await judge({ expectation: expect.assistant.llm, message: turn.message });
|
|
312
571
|
}
|
|
313
572
|
return { status: "pass" };
|
|
314
573
|
}
|
|
@@ -343,18 +602,18 @@ async function runFixtureFile(path, options = {}) {
|
|
|
343
602
|
for (const fixture of file.fixtures) {
|
|
344
603
|
try {
|
|
345
604
|
const agent = pickAgentSpec(file, fixture);
|
|
346
|
-
const
|
|
347
|
-
|
|
348
|
-
messages,
|
|
605
|
+
const res = await callSubprocessAgent(agent, {
|
|
606
|
+
messages: fixture.messages,
|
|
349
607
|
tools: [],
|
|
350
608
|
tool_choice: "auto"
|
|
351
609
|
});
|
|
352
|
-
const expectation = await evaluateExpectations(fixture.expect,
|
|
610
|
+
const expectation = await evaluateExpectations(fixture.expect, res, judge);
|
|
353
611
|
results.push({
|
|
354
612
|
id: fixture.id,
|
|
355
613
|
status: expectation.status,
|
|
356
614
|
reason: expectation.reason,
|
|
357
|
-
message
|
|
615
|
+
message: res.message,
|
|
616
|
+
tool_messages: res.tool_messages
|
|
358
617
|
});
|
|
359
618
|
} catch (err) {
|
|
360
619
|
const reason = err instanceof Error ? err.message : String(err);
|
|
@@ -457,4 +716,4 @@ Summary: ${pass} passed, ${fail} failed, ${skip} skipped`);
|
|
|
457
716
|
}
|
|
458
717
|
await main();
|
|
459
718
|
|
|
460
|
-
//# debugId=
|
|
719
|
+
//# debugId=8780BA705018958964756E2164756E21
|