@huydao/karrot 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GUIDE.md +3 -3
- package/README.md +192 -38
- package/dist/assertions/assertion.d.ts +7 -2
- package/dist/assertions/assertion.js +142 -1
- package/dist/executors/adapters/ag-ui.d.ts +2 -2
- package/dist/executors/adapters/ag-ui.js +379 -186
- package/dist/executors/execute.js +5 -4
- package/dist/prompts/turn-eval-system-prompt.md +31 -1
- package/dist/prompts/turn-message-gen-system-prompt.md +4 -2
- package/dist/reports/report.d.ts +1 -1
- package/dist/reports/report.js +18 -0
- package/dist/scenarios/generated-message.js +2 -0
- package/dist/scenarios/scenario-loader.d.ts +1 -1
- package/dist/scenarios/scenario-loader.js +41 -3
- package/dist/scenarios/scenario.d.ts +11 -7
- package/dist/utils/config.d.ts +2 -4
- package/package.json +12 -5
package/GUIDE.md
CHANGED
|
@@ -70,7 +70,7 @@ await execute('./karrot.config.yml', {
|
|
|
70
70
|
|
|
71
71
|
A scenario module must export:
|
|
72
72
|
- `scenarioSet`
|
|
73
|
-
- `buildScenarioContext(projectId)`
|
|
73
|
+
- `buildScenarioContext(baseContext)`
|
|
74
74
|
|
|
75
75
|
Example:
|
|
76
76
|
|
|
@@ -96,9 +96,9 @@ const scenarios: AiScenario<DemoContext>[] = [
|
|
|
96
96
|
|
|
97
97
|
export const scenarioSet = new AiScenarioSet(scenarios);
|
|
98
98
|
|
|
99
|
-
export function buildScenarioContext(projectId: string): DemoContext {
|
|
99
|
+
export function buildScenarioContext(baseContext: BaseAiScenarioContext): DemoContext {
|
|
100
100
|
return {
|
|
101
|
-
|
|
101
|
+
...baseContext,
|
|
102
102
|
projectLabel: 'RA Sample Project',
|
|
103
103
|
};
|
|
104
104
|
}
|
package/README.md
CHANGED
|
@@ -38,7 +38,7 @@ import { execute } from '@huydao/karrot';
|
|
|
38
38
|
|
|
39
39
|
await execute('./karrot.config.yml', {
|
|
40
40
|
variables: {
|
|
41
|
-
PROJECT_ID:
|
|
41
|
+
PROJECT_ID: process.env.PROJECT_ID,
|
|
42
42
|
JWT: process.env.JWT,
|
|
43
43
|
ACCOUNT_ID: process.env.ACCOUNT_ID,
|
|
44
44
|
WS_URL: process.env.WS_URL,
|
|
@@ -58,11 +58,102 @@ await execute('./karrot.config.yml', {
|
|
|
58
58
|
5. run selected scenarios
|
|
59
59
|
6. write JSON and HTML reports
|
|
60
60
|
|
|
61
|
-
##
|
|
61
|
+
## Recommended Setup Flow
|
|
62
|
+
|
|
63
|
+
The normal setup path is:
|
|
64
|
+
1. create a YAML config file for the WSS transport
|
|
65
|
+
2. create a scenario module that exports `scenarioSet` and `buildScenarioContext`
|
|
66
|
+
3. create a small run script that calls `execute()`
|
|
67
|
+
|
|
68
|
+
### 1. WSS config in YAML
|
|
69
|
+
|
|
70
|
+
Use one config file to describe transport, evaluation prompt settings, artifacts, and reporting.
|
|
71
|
+
|
|
72
|
+
```yml
|
|
73
|
+
version: 1
|
|
74
|
+
|
|
75
|
+
transport:
|
|
76
|
+
type: ag-ui-wss
|
|
77
|
+
env:
|
|
78
|
+
JWT: ${JWT}
|
|
79
|
+
ACCOUNT_ID: ${ACCOUNT_ID}
|
|
80
|
+
PROJECT_ID: ${PROJECT_ID}
|
|
81
|
+
AGENT_URL: ${AGENT_URL}
|
|
82
|
+
AGENT_ID: ${AGENT_ID}
|
|
83
|
+
WS_URL: ${WS_URL}
|
|
84
|
+
WS_TOPIC: ${WS_TOPIC}
|
|
85
|
+
WS_STOMP_HEADERS: Authorization:${JWT}
|
|
86
|
+
WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
|
|
87
|
+
processTimeoutMs: 120000
|
|
88
|
+
|
|
89
|
+
artifacts:
|
|
90
|
+
directory: ./artifacts
|
|
91
|
+
|
|
92
|
+
execution:
|
|
93
|
+
stopOnFailure: false
|
|
94
|
+
|
|
95
|
+
evaluation:
|
|
96
|
+
systemPromptPath: ./prompts/turn-eval-system-prompt.md
|
|
97
|
+
promptDirectory: ./prompts/eval
|
|
98
|
+
|
|
99
|
+
context:
|
|
100
|
+
projectId: ${PROJECT_ID}
|
|
101
|
+
|
|
102
|
+
report:
|
|
103
|
+
enabled: true
|
|
104
|
+
environment: prod
|
|
105
|
+
projectName: Demo Project
|
|
106
|
+
runtime:
|
|
107
|
+
agentUrl: ${AGENT_URL}
|
|
108
|
+
agentId: ${AGENT_ID}
|
|
109
|
+
wsUrl: ${WS_URL}
|
|
110
|
+
wsTopic: ${WS_TOPIC}
|
|
111
|
+
accountId: ${ACCOUNT_ID}
|
|
112
|
+
projectId: ${PROJECT_ID}
|
|
113
|
+
appBaseUrl: ${APP_BASE_URL}
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
What this does:
|
|
117
|
+
- `transport`: tells Karrot how to talk to the assistant
|
|
118
|
+
- `evaluation`: points to the turn-eval rubric and any extra project-specific dimension prompts
|
|
119
|
+
- `context`: makes resolved values available to scenarios
|
|
120
|
+
- `report`: controls run metadata written into reports
|
|
121
|
+
|
|
122
|
+
### 2. Scenario module
|
|
123
|
+
|
|
124
|
+
A scenario module defines the multi-turn tests that Karrot will run.
|
|
125
|
+
|
|
126
|
+
### 3. Run script
|
|
127
|
+
|
|
128
|
+
Use a small script to resolve variables and point Karrot at the scenario file.
|
|
129
|
+
|
|
130
|
+
```ts
|
|
131
|
+
import { execute } from '@huydao/karrot';
|
|
132
|
+
|
|
133
|
+
await execute('./karrot.config.yml', {
|
|
134
|
+
variables: {
|
|
135
|
+
PROJECT_ID: process.env.PROJECT_ID,
|
|
136
|
+
JWT: process.env.JWT,
|
|
137
|
+
ACCOUNT_ID: process.env.ACCOUNT_ID,
|
|
138
|
+
AGENT_URL: process.env.AGENT_URL,
|
|
139
|
+
AGENT_ID: process.env.AGENT_ID,
|
|
140
|
+
WS_URL: process.env.WS_URL,
|
|
141
|
+
WS_TOPIC: process.env.WS_TOPIC,
|
|
142
|
+
WS_ORIGIN: process.env.WS_ORIGIN,
|
|
143
|
+
APP_BASE_URL: process.env.APP_BASE_URL,
|
|
144
|
+
},
|
|
145
|
+
scenario: {
|
|
146
|
+
file: './src/scenarios/basic-two-turn-demo.ts',
|
|
147
|
+
ids: ['BASIC-2T'],
|
|
148
|
+
},
|
|
149
|
+
});
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Scenario Structure
|
|
62
153
|
|
|
63
154
|
A scenario module exports:
|
|
64
155
|
- `scenarioSet`
|
|
65
|
-
- `buildScenarioContext(projectId)`
|
|
156
|
+
- `buildScenarioContext(baseContext)`
|
|
66
157
|
|
|
67
158
|
Minimal example:
|
|
68
159
|
|
|
@@ -88,8 +179,68 @@ const scenarios: AiScenario<BaseAiScenarioContext>[] = [
|
|
|
88
179
|
|
|
89
180
|
export const scenarioSet = new AiScenarioSet(scenarios);
|
|
90
181
|
|
|
91
|
-
export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
|
|
92
|
-
return {
|
|
182
|
+
export function buildScenarioContext(baseContext: BaseAiScenarioContext): BaseAiScenarioContext {
|
|
183
|
+
return { ...baseContext };
|
|
184
|
+
}
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Scenario shape
|
|
188
|
+
|
|
189
|
+
Each scenario typically contains:
|
|
190
|
+
- `id`: stable scenario identifier
|
|
191
|
+
- `name`: human-readable scenario name
|
|
192
|
+
- `turns`: ordered list of user turns to execute
|
|
193
|
+
|
|
194
|
+
Each turn supports:
|
|
195
|
+
- `label`: display label in reports
|
|
196
|
+
- `message`: the user message to send
|
|
197
|
+
- `idleTimeoutMs`: optional wait limit for message inactivity
|
|
198
|
+
- `processTimeoutMs`: optional hard timeout for the turn
|
|
199
|
+
- `assertions`: pass/fail checks for the turn output
|
|
200
|
+
- `eval`: quality scoring dimensions for the turn output
|
|
201
|
+
- `onComplete`: optional callback for turn-level post-processing
|
|
202
|
+
|
|
203
|
+
### Message options
|
|
204
|
+
|
|
205
|
+
`message` can be:
|
|
206
|
+
- a function `(context) => string`
|
|
207
|
+
- `aiGen.fromPreviousContext()`
|
|
208
|
+
- `aiGen.fromGuidance(guidance)`
|
|
209
|
+
- `aiGen.fromContent(content)`
|
|
210
|
+
|
|
211
|
+
This gives you a few common scenario authoring patterns:
|
|
212
|
+
- fixed prompts for deterministic tests
|
|
213
|
+
- context-aware prompts that use scenario data
|
|
214
|
+
- generated user prompts for more adaptive multi-turn flows
|
|
215
|
+
|
|
216
|
+
Example with assertions and eval on a turn:
|
|
217
|
+
|
|
218
|
+
```ts
|
|
219
|
+
import { AiScenarioSet, aiGen, type AiScenario, type BaseAiScenarioContext } from '@huydao/karrot';
|
|
220
|
+
|
|
221
|
+
const scenarios: AiScenario<BaseAiScenarioContext>[] = [
|
|
222
|
+
{
|
|
223
|
+
id: 'FOLLOW-UP-1',
|
|
224
|
+
name: 'Follow-up prompt generation',
|
|
225
|
+
turns: [
|
|
226
|
+
{
|
|
227
|
+
label: 'Ask for next prompts',
|
|
228
|
+
message: aiGen.fromGuidance(
|
|
229
|
+
'Ask for 3 concise follow-up prompts the user can send next based on the previous answer.',
|
|
230
|
+
),
|
|
231
|
+
assertions: [
|
|
232
|
+
{ assert: { hasText: 'prompt' } },
|
|
233
|
+
],
|
|
234
|
+
eval: ['correctness', 'helpfulness', 'relevance'],
|
|
235
|
+
},
|
|
236
|
+
],
|
|
237
|
+
},
|
|
238
|
+
];
|
|
239
|
+
|
|
240
|
+
export const scenarioSet = new AiScenarioSet(scenarios);
|
|
241
|
+
|
|
242
|
+
export function buildScenarioContext(baseContext: BaseAiScenarioContext): BaseAiScenarioContext {
|
|
243
|
+
return { ...baseContext };
|
|
93
244
|
}
|
|
94
245
|
```
|
|
95
246
|
|
|
@@ -97,6 +248,8 @@ export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
|
|
|
97
248
|
|
|
98
249
|
Karrot supports two assertion styles.
|
|
99
250
|
|
|
251
|
+
Use assertions for pass/fail requirements. If a turn must contain or avoid something specific, assertions are the right tool.
|
|
252
|
+
|
|
100
253
|
Direct assertions:
|
|
101
254
|
|
|
102
255
|
```ts
|
|
@@ -115,9 +268,15 @@ assertions: [
|
|
|
115
268
|
]
|
|
116
269
|
```
|
|
117
270
|
|
|
271
|
+
Assertion guidance:
|
|
272
|
+
- Use direct assertions when the expected output is deterministic enough to check literally.
|
|
273
|
+
- Use AI assertions when the requirement is semantic and cannot be captured safely with exact string matching.
|
|
274
|
+
- Use assertions to decide whether the turn satisfied a contract, not to measure answer quality.
|
|
275
|
+
|
|
118
276
|
## Evaluations
|
|
119
277
|
|
|
120
278
|
Turn evals score the assistant response for named dimensions.
|
|
279
|
+
Karrot applies a CheckEval-inspired evaluation rubric: broad dimensions are decomposed into concrete checklist-style checks before assigning a final score, which improves consistency and makes explanations more traceable.
|
|
121
280
|
|
|
122
281
|
```ts
|
|
123
282
|
eval: ['correctness', 'coverage', 'helpfulness']
|
|
@@ -135,12 +294,37 @@ eval: [
|
|
|
135
294
|
]
|
|
136
295
|
```
|
|
137
296
|
|
|
297
|
+
Use eval when you want a quality score rather than a hard pass/fail rule.
|
|
298
|
+
|
|
299
|
+
Built-in dimensions commonly used by Karrot:
|
|
300
|
+
- `correctness`
|
|
301
|
+
- `coverage`
|
|
302
|
+
- `helpfulness`
|
|
303
|
+
- `clarity`
|
|
304
|
+
- `completeness`
|
|
305
|
+
- `conciseness`
|
|
306
|
+
- `relevance`
|
|
307
|
+
- `actionability`
|
|
308
|
+
- `structure`
|
|
309
|
+
- `consistency`
|
|
310
|
+
- `safety`
|
|
311
|
+
|
|
138
312
|
Project-level eval prompts can be configured through:
|
|
139
313
|
- `evaluation.systemPromptPath`
|
|
140
314
|
- `evaluation.promptDirectory`
|
|
141
315
|
|
|
142
316
|
That lets the project define rubric files without repeating inline guidance in every scenario.
|
|
143
317
|
|
|
318
|
+
Use:
|
|
319
|
+
- `systemPromptPath` when you want to replace the whole turn-eval rubric
|
|
320
|
+
- `promptDirectory` when you want to add custom project-specific dimensions
|
|
321
|
+
|
|
322
|
+
Eval guidance:
|
|
323
|
+
- Use assertions for required behavior.
|
|
324
|
+
- Use eval for quality measurement across dimensions.
|
|
325
|
+
- Prefer a small number of dimensions that reflect the goal of the turn.
|
|
326
|
+
- Because Karrot applies CheckEval-style scoring, dimensions like `relevance` and `consistency` are judged through concrete sub-checks instead of a vague overall impression.
|
|
327
|
+
|
|
144
328
|
## AI-Generated User Messages
|
|
145
329
|
|
|
146
330
|
Karrot can generate a user turn message before sending it to the target assistant.
|
|
@@ -173,39 +357,9 @@ Karrot config currently supports:
|
|
|
173
357
|
- `context`
|
|
174
358
|
- `report`
|
|
175
359
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
version: 1
|
|
180
|
-
|
|
181
|
-
transport:
|
|
182
|
-
type: ag-ui-wss
|
|
183
|
-
env:
|
|
184
|
-
JWT: ${JWT}
|
|
185
|
-
ACCOUNT_ID: ${ACCOUNT_ID}
|
|
186
|
-
PROJECT_ID: ${PROJECT_ID}
|
|
187
|
-
AGENT_URL: ${AGENT_URL}
|
|
188
|
-
AGENT_ID: ${AGENT_ID}
|
|
189
|
-
WS_URL: ${WS_URL}
|
|
190
|
-
WS_TOPIC: ${WS_TOPIC}
|
|
191
|
-
WS_STOMP_HEADERS: Authorization:${JWT}
|
|
192
|
-
WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
|
|
193
|
-
|
|
194
|
-
context:
|
|
195
|
-
projectId: ${PROJECT_ID}
|
|
196
|
-
|
|
197
|
-
report:
|
|
198
|
-
environment: prod
|
|
199
|
-
projectName: Demo Project
|
|
200
|
-
runtime:
|
|
201
|
-
agentUrl: ${AGENT_URL}
|
|
202
|
-
agentId: ${AGENT_ID}
|
|
203
|
-
wsUrl: ${WS_URL}
|
|
204
|
-
wsTopic: ${WS_TOPIC}
|
|
205
|
-
accountId: ${ACCOUNT_ID}
|
|
206
|
-
projectId: ${PROJECT_ID}
|
|
207
|
-
appBaseUrl: ${APP_BASE_URL}
|
|
208
|
-
```
|
|
360
|
+
Important design choice:
|
|
361
|
+
- config and scenario are separate
|
|
362
|
+
- one transport config can be reused across many scenario files
|
|
209
363
|
|
|
210
364
|
## Reports and Artifacts
|
|
211
365
|
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import { type AiTurnAssertion } from '../scenarios/scenario';
|
|
2
2
|
export type AssertionEvaluationResult = {
|
|
3
3
|
kind: 'assert' | 'aiAssert';
|
|
4
|
-
matcher: 'hasText' | 'toolcall' | 'hasContent' | 'notHasContent';
|
|
5
|
-
expected: string | string[];
|
|
4
|
+
matcher: 'hasText' | 'toolcall' | 'toolcallWithContent' | 'hasContent' | 'notHasContent';
|
|
5
|
+
expected: string | string[] | {
|
|
6
|
+
name: string;
|
|
7
|
+
hasText?: string | string[];
|
|
8
|
+
hasProperties?: Record<string, unknown>;
|
|
9
|
+
};
|
|
6
10
|
description?: string;
|
|
7
11
|
passed: boolean;
|
|
8
12
|
reason: string;
|
|
@@ -13,6 +17,7 @@ type EvaluateTurnAssertionsOptions = {
|
|
|
13
17
|
toolCalls?: string[];
|
|
14
18
|
env?: NodeJS.ProcessEnv;
|
|
15
19
|
outputDirectory?: string;
|
|
20
|
+
outputPath?: string;
|
|
16
21
|
};
|
|
17
22
|
export declare function evaluateTurnAssertions(options: EvaluateTurnAssertionsOptions): Promise<AssertionEvaluationResult[]>;
|
|
18
23
|
export {};
|
|
@@ -12,6 +12,14 @@ function normalizeAssertion(assertion) {
|
|
|
12
12
|
description: assertion.description,
|
|
13
13
|
};
|
|
14
14
|
}
|
|
15
|
+
if ('toolcallWithContent' in assertion.assert) {
|
|
16
|
+
return {
|
|
17
|
+
kind: 'assert',
|
|
18
|
+
matcher: 'toolcallWithContent',
|
|
19
|
+
expected: assertion.assert.toolcallWithContent,
|
|
20
|
+
description: assertion.description,
|
|
21
|
+
};
|
|
22
|
+
}
|
|
15
23
|
return {
|
|
16
24
|
kind: 'assert',
|
|
17
25
|
matcher: 'toolcall',
|
|
@@ -35,11 +43,32 @@ function normalizeAssertion(assertion) {
|
|
|
35
43
|
};
|
|
36
44
|
}
|
|
37
45
|
function formatExpectedValue(expected) {
|
|
46
|
+
if (typeof expected === 'object' && expected !== null && !Array.isArray(expected)) {
|
|
47
|
+
return JSON.stringify(expected);
|
|
48
|
+
}
|
|
38
49
|
if (Array.isArray(expected)) {
|
|
39
50
|
return `[${expected.join(', ')}]`;
|
|
40
51
|
}
|
|
41
52
|
return `"${expected}"`;
|
|
42
53
|
}
|
|
54
|
+
function isPlainObject(value) {
|
|
55
|
+
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
|
56
|
+
}
|
|
57
|
+
function matchesExpectedProperties(actual, expected) {
|
|
58
|
+
if (Array.isArray(expected)) {
|
|
59
|
+
if (!Array.isArray(actual) || actual.length < expected.length) {
|
|
60
|
+
return false;
|
|
61
|
+
}
|
|
62
|
+
return expected.every((expectedItem, index) => matchesExpectedProperties(actual[index], expectedItem));
|
|
63
|
+
}
|
|
64
|
+
if (isPlainObject(expected)) {
|
|
65
|
+
if (!isPlainObject(actual)) {
|
|
66
|
+
return false;
|
|
67
|
+
}
|
|
68
|
+
return Object.entries(expected).every(([key, value]) => matchesExpectedProperties(actual[key], value));
|
|
69
|
+
}
|
|
70
|
+
return Object.is(actual, expected);
|
|
71
|
+
}
|
|
43
72
|
function buildAssertionReason(result) {
|
|
44
73
|
const prefix = result.description ? `${result.description}: ` : '';
|
|
45
74
|
return `${prefix}${result.matcher}(${formatExpectedValue(result.expected)})`;
|
|
@@ -130,6 +159,18 @@ async function evaluateAiAssertion(assertion, output, env) {
|
|
|
130
159
|
description: assertion.description,
|
|
131
160
|
});
|
|
132
161
|
}
|
|
162
|
+
async function readOutputLog(outputPath) {
|
|
163
|
+
if (!outputPath) {
|
|
164
|
+
return '';
|
|
165
|
+
}
|
|
166
|
+
try {
|
|
167
|
+
const { readFile } = await import('node:fs/promises');
|
|
168
|
+
return await readFile(outputPath, 'utf8');
|
|
169
|
+
}
|
|
170
|
+
catch {
|
|
171
|
+
return '';
|
|
172
|
+
}
|
|
173
|
+
}
|
|
133
174
|
function evaluateToolCallAssertion(assertion, toolCalls) {
|
|
134
175
|
const expectedToolCalls = (Array.isArray(assertion.expected) ? assertion.expected : [])
|
|
135
176
|
.map((toolCall) => toolCall.trim())
|
|
@@ -166,11 +207,99 @@ function evaluateToolCallAssertion(assertion, toolCalls) {
|
|
|
166
207
|
: `${buildNormalizedAssertionReason(normalizedToolCallAssertion)} failed. Missing: ${formatExpectedValue(missingToolCalls)}. Observed: ${formatExpectedValue(actualToolCalls)}.`,
|
|
167
208
|
};
|
|
168
209
|
}
|
|
210
|
+
function evaluateToolCallWithContentAssertion(assertion, logContent) {
|
|
211
|
+
const expected = typeof assertion.expected === 'object' && assertion.expected !== null && !Array.isArray(assertion.expected)
|
|
212
|
+
? assertion.expected
|
|
213
|
+
: undefined;
|
|
214
|
+
if (!expected) {
|
|
215
|
+
return {
|
|
216
|
+
kind: assertion.kind,
|
|
217
|
+
matcher: assertion.matcher,
|
|
218
|
+
expected: assertion.expected,
|
|
219
|
+
description: assertion.description,
|
|
220
|
+
passed: false,
|
|
221
|
+
reason: `${buildNormalizedAssertionReason(assertion)} failed. Invalid toolcallWithContent expectation.`,
|
|
222
|
+
};
|
|
223
|
+
}
|
|
224
|
+
const expectedTexts = (Array.isArray(expected.hasText) ? expected.hasText : [expected.hasText])
|
|
225
|
+
.filter((value) => typeof value === 'string')
|
|
226
|
+
.map((value) => value.trim())
|
|
227
|
+
.filter(Boolean);
|
|
228
|
+
const matchingToolCallIds = new Set();
|
|
229
|
+
const eventPayloadsByToolCallId = new Map();
|
|
230
|
+
const parsedArgumentsByToolCallId = new Map();
|
|
231
|
+
for (const rawLine of logContent.split('\n')) {
|
|
232
|
+
const line = rawLine.trim();
|
|
233
|
+
if (!line) {
|
|
234
|
+
continue;
|
|
235
|
+
}
|
|
236
|
+
try {
|
|
237
|
+
const event = JSON.parse(line);
|
|
238
|
+
if (!event.toolCallId) {
|
|
239
|
+
continue;
|
|
240
|
+
}
|
|
241
|
+
if (event.toolCallName === expected.name) {
|
|
242
|
+
matchingToolCallIds.add(event.toolCallId);
|
|
243
|
+
}
|
|
244
|
+
if (matchingToolCallIds.has(event.toolCallId)) {
|
|
245
|
+
eventPayloadsByToolCallId.set(event.toolCallId, [
|
|
246
|
+
...(eventPayloadsByToolCallId.get(event.toolCallId) ?? []),
|
|
247
|
+
line,
|
|
248
|
+
]);
|
|
249
|
+
if (typeof event.arguments === 'string' && event.arguments.trim()) {
|
|
250
|
+
try {
|
|
251
|
+
const parsedArguments = JSON.parse(event.arguments);
|
|
252
|
+
parsedArgumentsByToolCallId.set(event.toolCallId, [
|
|
253
|
+
...(parsedArgumentsByToolCallId.get(event.toolCallId) ?? []),
|
|
254
|
+
parsedArguments,
|
|
255
|
+
]);
|
|
256
|
+
}
|
|
257
|
+
catch {
|
|
258
|
+
// Ignore unparsable arguments and fall back to raw text matching.
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
catch {
|
|
264
|
+
continue;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
if (matchingToolCallIds.size === 0) {
|
|
268
|
+
return {
|
|
269
|
+
kind: assertion.kind,
|
|
270
|
+
matcher: assertion.matcher,
|
|
271
|
+
expected,
|
|
272
|
+
description: assertion.description,
|
|
273
|
+
passed: false,
|
|
274
|
+
reason: `${buildNormalizedAssertionReason(assertion)} failed. Tool call "${expected.name}" was not found in the run log.`,
|
|
275
|
+
};
|
|
276
|
+
}
|
|
277
|
+
const combinedPayload = [...matchingToolCallIds]
|
|
278
|
+
.flatMap((toolCallId) => eventPayloadsByToolCallId.get(toolCallId) ?? [])
|
|
279
|
+
.join('\n');
|
|
280
|
+
const missingTexts = expectedTexts.filter((text) => !combinedPayload.includes(text));
|
|
281
|
+
const hasPropertiesMatch = expected.hasProperties === undefined ||
|
|
282
|
+
[...matchingToolCallIds].some((toolCallId) => (parsedArgumentsByToolCallId.get(toolCallId) ?? []).some((parsedArguments) => matchesExpectedProperties(parsedArguments, expected.hasProperties)));
|
|
283
|
+
const passed = missingTexts.length === 0 && hasPropertiesMatch;
|
|
284
|
+
return {
|
|
285
|
+
kind: assertion.kind,
|
|
286
|
+
matcher: assertion.matcher,
|
|
287
|
+
expected,
|
|
288
|
+
description: assertion.description,
|
|
289
|
+
passed,
|
|
290
|
+
reason: passed
|
|
291
|
+
? `${buildNormalizedAssertionReason(assertion)} passed.`
|
|
292
|
+
: !hasPropertiesMatch
|
|
293
|
+
? `${buildNormalizedAssertionReason(assertion)} failed. Missing properties: ${JSON.stringify(expected.hasProperties ?? {})}.`
|
|
294
|
+
: `${buildNormalizedAssertionReason(assertion)} failed. Missing text: ${formatExpectedValue(missingTexts)}.`,
|
|
295
|
+
};
|
|
296
|
+
}
|
|
169
297
|
async function evaluateTurnAssertions(options) {
|
|
170
298
|
if (!options.assertions?.length) {
|
|
171
299
|
return [];
|
|
172
300
|
}
|
|
173
301
|
const results = [];
|
|
302
|
+
let outputLogContent;
|
|
174
303
|
for (const rawAssertion of options.assertions) {
|
|
175
304
|
const assertion = normalizeAssertion(rawAssertion);
|
|
176
305
|
if (assertion.kind === 'assert') {
|
|
@@ -178,7 +307,19 @@ async function evaluateTurnAssertions(options) {
|
|
|
178
307
|
results.push(evaluateToolCallAssertion(assertion, options.toolCalls));
|
|
179
308
|
continue;
|
|
180
309
|
}
|
|
181
|
-
|
|
310
|
+
if (assertion.matcher === 'toolcallWithContent') {
|
|
311
|
+
outputLogContent ??= await readOutputLog(options.outputPath);
|
|
312
|
+
results.push(evaluateToolCallWithContentAssertion(assertion, outputLogContent));
|
|
313
|
+
continue;
|
|
314
|
+
}
|
|
315
|
+
if (assertion.matcher !== 'hasText') {
|
|
316
|
+
continue;
|
|
317
|
+
}
|
|
318
|
+
const expected = typeof assertion.expected === 'string'
|
|
319
|
+
? assertion.expected
|
|
320
|
+
: Array.isArray(assertion.expected)
|
|
321
|
+
? assertion.expected.join(', ')
|
|
322
|
+
: JSON.stringify(assertion.expected);
|
|
182
323
|
const passed = options.output.includes(expected);
|
|
183
324
|
results.push({
|
|
184
325
|
kind: assertion.kind,
|
|
@@ -8,8 +8,8 @@ type RunAgUiMessageOptions = {
|
|
|
8
8
|
allowIdleTimeoutWithAssistantText?: boolean;
|
|
9
9
|
processTimeoutMs?: number;
|
|
10
10
|
};
|
|
11
|
-
|
|
11
|
+
declare function parseExecutionTestResultId(output: string): string | undefined;
|
|
12
|
+
export { parseExecutionTestResultId };
|
|
12
13
|
export declare function extractToolCallNames(logContent: string): string[];
|
|
13
14
|
export declare function extractAppendedLog(previousLogContent: string, latestLogContent: string): string;
|
|
14
15
|
export declare function runAgUiMessage(options: RunAgUiMessageOptions): Promise<MessageRunResult>;
|
|
15
|
-
export {};
|