@huydao/karrot 0.1.1 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/GUIDE.md CHANGED
@@ -70,7 +70,7 @@ await execute('./karrot.config.yml', {
70
70
 
71
71
  A scenario module must export:
72
72
  - `scenarioSet`
73
- - `buildScenarioContext(projectId)`
73
+ - `buildScenarioContext(baseContext)`
74
74
 
75
75
  Example:
76
76
 
@@ -96,9 +96,9 @@ const scenarios: AiScenario<DemoContext>[] = [
96
96
 
97
97
  export const scenarioSet = new AiScenarioSet(scenarios);
98
98
 
99
- export function buildScenarioContext(projectId: string): DemoContext {
99
+ export function buildScenarioContext(baseContext: BaseAiScenarioContext): DemoContext {
100
100
  return {
101
- projectId,
101
+ ...baseContext,
102
102
  projectLabel: 'RA Sample Project',
103
103
  };
104
104
  }
package/README.md CHANGED
@@ -38,7 +38,7 @@ import { execute } from '@huydao/karrot';
38
38
 
39
39
  await execute('./karrot.config.yml', {
40
40
  variables: {
41
- PROJECT_ID: '3422056',
41
+ PROJECT_ID: process.env.PROJECT_ID,
42
42
  JWT: process.env.JWT,
43
43
  ACCOUNT_ID: process.env.ACCOUNT_ID,
44
44
  WS_URL: process.env.WS_URL,
@@ -58,11 +58,102 @@ await execute('./karrot.config.yml', {
58
58
  5. run selected scenarios
59
59
  6. write JSON and HTML reports
60
60
 
61
- ## Scenario Authoring
61
+ ## Recommended Setup Flow
62
+
63
+ The normal setup path is:
64
+ 1. create a YAML config file for the WSS transport
65
+ 2. create a scenario module that exports `scenarioSet` and `buildScenarioContext`
66
+ 3. create a small run script that calls `execute()`
67
+
68
+ ### 1. WSS config in YAML
69
+
70
+ Use one config file to describe transport, evaluation prompt settings, artifacts, and reporting.
71
+
72
+ ```yml
73
+ version: 1
74
+
75
+ transport:
76
+ type: ag-ui-wss
77
+ env:
78
+ JWT: ${JWT}
79
+ ACCOUNT_ID: ${ACCOUNT_ID}
80
+ PROJECT_ID: ${PROJECT_ID}
81
+ AGENT_URL: ${AGENT_URL}
82
+ AGENT_ID: ${AGENT_ID}
83
+ WS_URL: ${WS_URL}
84
+ WS_TOPIC: ${WS_TOPIC}
85
+ WS_STOMP_HEADERS: Authorization:${JWT}
86
+ WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
87
+ processTimeoutMs: 120000
88
+
89
+ artifacts:
90
+ directory: ./artifacts
91
+
92
+ execution:
93
+ stopOnFailure: false
94
+
95
+ evaluation:
96
+ systemPromptPath: ./prompts/turn-eval-system-prompt.md
97
+ promptDirectory: ./prompts/eval
98
+
99
+ context:
100
+ projectId: ${PROJECT_ID}
101
+
102
+ report:
103
+ enabled: true
104
+ environment: prod
105
+ projectName: Demo Project
106
+ runtime:
107
+ agentUrl: ${AGENT_URL}
108
+ agentId: ${AGENT_ID}
109
+ wsUrl: ${WS_URL}
110
+ wsTopic: ${WS_TOPIC}
111
+ accountId: ${ACCOUNT_ID}
112
+ projectId: ${PROJECT_ID}
113
+ appBaseUrl: ${APP_BASE_URL}
114
+ ```
115
+
116
+ What this does:
117
+ - `transport`: tells Karrot how to talk to the assistant
118
+ - `evaluation`: points to the turn-eval rubric and any extra project-specific dimension prompts
119
+ - `context`: makes resolved values available to scenarios
120
+ - `report`: controls run metadata written into reports
121
+
122
+ ### 2. Scenario module
123
+
124
+ A scenario module defines the multi-turn tests that Karrot will run.
125
+
126
+ ### 3. Run script
127
+
128
+ Use a small script to resolve variables and point Karrot at the scenario file.
129
+
130
+ ```ts
131
+ import { execute } from '@huydao/karrot';
132
+
133
+ await execute('./karrot.config.yml', {
134
+ variables: {
135
+ PROJECT_ID: process.env.PROJECT_ID,
136
+ JWT: process.env.JWT,
137
+ ACCOUNT_ID: process.env.ACCOUNT_ID,
138
+ AGENT_URL: process.env.AGENT_URL,
139
+ AGENT_ID: process.env.AGENT_ID,
140
+ WS_URL: process.env.WS_URL,
141
+ WS_TOPIC: process.env.WS_TOPIC,
142
+ WS_ORIGIN: process.env.WS_ORIGIN,
143
+ APP_BASE_URL: process.env.APP_BASE_URL,
144
+ },
145
+ scenario: {
146
+ file: './src/scenarios/basic-two-turn-demo.ts',
147
+ ids: ['BASIC-2T'],
148
+ },
149
+ });
150
+ ```
151
+
152
+ ## Scenario Structure
62
153
 
63
154
  A scenario module exports:
64
155
  - `scenarioSet`
65
- - `buildScenarioContext(projectId)`
156
+ - `buildScenarioContext(baseContext)`
66
157
 
67
158
  Minimal example:
68
159
 
@@ -88,8 +179,68 @@ const scenarios: AiScenario<BaseAiScenarioContext>[] = [
88
179
 
89
180
  export const scenarioSet = new AiScenarioSet(scenarios);
90
181
 
91
- export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
92
- return { projectId };
182
+ export function buildScenarioContext(baseContext: BaseAiScenarioContext): BaseAiScenarioContext {
183
+ return { ...baseContext };
184
+ }
185
+ ```
186
+
187
+ ### Scenario shape
188
+
189
+ Each scenario typically contains:
190
+ - `id`: stable scenario identifier
191
+ - `name`: human-readable scenario name
192
+ - `turns`: ordered list of user turns to execute
193
+
194
+ Each turn supports:
195
+ - `label`: display label in reports
196
+ - `message`: the user message to send
197
+ - `idleTimeoutMs`: optional wait limit for message inactivity
198
+ - `processTimeoutMs`: optional hard timeout for the turn
199
+ - `assertions`: pass/fail checks for the turn output
200
+ - `eval`: quality scoring dimensions for the turn output
201
+ - `onComplete`: optional callback for turn-level post-processing
202
+
203
+ ### Message options
204
+
205
+ `message` can be:
206
+ - a function `(context) => string`
207
+ - `aiGen.fromPreviousContext()`
208
+ - `aiGen.fromGuidance(guidance)`
209
+ - `aiGen.fromContent(content)`
210
+
211
+ This gives you a few common scenario authoring patterns:
212
+ - fixed prompts for deterministic tests
213
+ - context-aware prompts that use scenario data
214
+ - generated user prompts for more adaptive multi-turn flows
215
+
216
+ Example with assertions and eval on a turn:
217
+
218
+ ```ts
219
+ import { AiScenarioSet, aiGen, type AiScenario, type BaseAiScenarioContext } from '@huydao/karrot';
220
+
221
+ const scenarios: AiScenario<BaseAiScenarioContext>[] = [
222
+ {
223
+ id: 'FOLLOW-UP-1',
224
+ name: 'Follow-up prompt generation',
225
+ turns: [
226
+ {
227
+ label: 'Ask for next prompts',
228
+ message: aiGen.fromGuidance(
229
+ 'Ask for 3 concise follow-up prompts the user can send next based on the previous answer.',
230
+ ),
231
+ assertions: [
232
+ { assert: { hasText: 'prompt' } },
233
+ ],
234
+ eval: ['correctness', 'helpfulness', 'relevance'],
235
+ },
236
+ ],
237
+ },
238
+ ];
239
+
240
+ export const scenarioSet = new AiScenarioSet(scenarios);
241
+
242
+ export function buildScenarioContext(baseContext: BaseAiScenarioContext): BaseAiScenarioContext {
243
+ return { ...baseContext };
93
244
  }
94
245
  ```
95
246
 
@@ -97,6 +248,8 @@ export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
97
248
 
98
249
  Karrot supports two assertion styles.
99
250
 
251
+ Use assertions for pass/fail requirements. If a turn must contain or avoid something specific, assertions are the right tool.
252
+
100
253
  Direct assertions:
101
254
 
102
255
  ```ts
@@ -115,9 +268,15 @@ assertions: [
115
268
  ]
116
269
  ```
117
270
 
271
+ Assertion guidance:
272
+ - Use direct assertions when the expected output is deterministic enough to check literally.
273
+ - Use AI assertions when the requirement is semantic and cannot be captured safely with exact string matching.
274
+ - Use assertions to decide whether the turn satisfied a contract, not to measure answer quality.
275
+
118
276
  ## Evaluations
119
277
 
120
278
  Turn evals score the assistant response for named dimensions.
279
+ Karrot applies a CheckEval-inspired evaluation rubric: broad dimensions are decomposed into concrete checklist-style checks before assigning a final score, which improves consistency and makes explanations more traceable.
121
280
 
122
281
  ```ts
123
282
  eval: ['correctness', 'coverage', 'helpfulness']
@@ -135,12 +294,37 @@ eval: [
135
294
  ]
136
295
  ```
137
296
 
297
+ Use eval when you want a quality score rather than a hard pass/fail rule.
298
+
299
+ Built-in dimensions commonly used by Karrot:
300
+ - `correctness`
301
+ - `coverage`
302
+ - `helpfulness`
303
+ - `clarity`
304
+ - `completeness`
305
+ - `conciseness`
306
+ - `relevance`
307
+ - `actionability`
308
+ - `structure`
309
+ - `consistency`
310
+ - `safety`
311
+
138
312
  Project-level eval prompts can be configured through:
139
313
  - `evaluation.systemPromptPath`
140
314
  - `evaluation.promptDirectory`
141
315
 
142
316
  That lets the project define rubric files without repeating inline guidance in every scenario.
143
317
 
318
+ Use:
319
+ - `systemPromptPath` when you want to replace the whole turn-eval rubric
320
+ - `promptDirectory` when you want to add custom project-specific dimensions
321
+
322
+ Eval guidance:
323
+ - Use assertions for required behavior.
324
+ - Use eval for quality measurement across dimensions.
325
+ - Prefer a small number of dimensions that reflect the goal of the turn.
326
+ - Because Karrot applies CheckEval-style scoring, dimensions like `relevance` and `consistency` are judged through concrete sub-checks instead of a vague overall impression.
327
+
144
328
  ## AI-Generated User Messages
145
329
 
146
330
  Karrot can generate a user turn message before sending it to the target assistant.
@@ -173,39 +357,9 @@ Karrot config currently supports:
173
357
  - `context`
174
358
  - `report`
175
359
 
176
- Example `ag-ui-wss` config:
177
-
178
- ```yml
179
- version: 1
180
-
181
- transport:
182
- type: ag-ui-wss
183
- env:
184
- JWT: ${JWT}
185
- ACCOUNT_ID: ${ACCOUNT_ID}
186
- PROJECT_ID: ${PROJECT_ID}
187
- AGENT_URL: ${AGENT_URL}
188
- AGENT_ID: ${AGENT_ID}
189
- WS_URL: ${WS_URL}
190
- WS_TOPIC: ${WS_TOPIC}
191
- WS_STOMP_HEADERS: Authorization:${JWT}
192
- WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
193
-
194
- context:
195
- projectId: ${PROJECT_ID}
196
-
197
- report:
198
- environment: prod
199
- projectName: Demo Project
200
- runtime:
201
- agentUrl: ${AGENT_URL}
202
- agentId: ${AGENT_ID}
203
- wsUrl: ${WS_URL}
204
- wsTopic: ${WS_TOPIC}
205
- accountId: ${ACCOUNT_ID}
206
- projectId: ${PROJECT_ID}
207
- appBaseUrl: ${APP_BASE_URL}
208
- ```
360
+ Important design choice:
361
+ - config and scenario are separate
362
+ - one transport config can be reused across many scenario files
209
363
 
210
364
  ## Reports and Artifacts
211
365
 
@@ -1,8 +1,12 @@
1
1
  import { type AiTurnAssertion } from '../scenarios/scenario';
2
2
  export type AssertionEvaluationResult = {
3
3
  kind: 'assert' | 'aiAssert';
4
- matcher: 'hasText' | 'toolcall' | 'hasContent' | 'notHasContent';
5
- expected: string | string[];
4
+ matcher: 'hasText' | 'toolcall' | 'toolcallWithContent' | 'hasContent' | 'notHasContent';
5
+ expected: string | string[] | {
6
+ name: string;
7
+ hasText?: string | string[];
8
+ hasProperties?: Record<string, unknown>;
9
+ };
6
10
  description?: string;
7
11
  passed: boolean;
8
12
  reason: string;
@@ -13,6 +17,7 @@ type EvaluateTurnAssertionsOptions = {
13
17
  toolCalls?: string[];
14
18
  env?: NodeJS.ProcessEnv;
15
19
  outputDirectory?: string;
20
+ outputPath?: string;
16
21
  };
17
22
  export declare function evaluateTurnAssertions(options: EvaluateTurnAssertionsOptions): Promise<AssertionEvaluationResult[]>;
18
23
  export {};
@@ -12,6 +12,14 @@ function normalizeAssertion(assertion) {
12
12
  description: assertion.description,
13
13
  };
14
14
  }
15
+ if ('toolcallWithContent' in assertion.assert) {
16
+ return {
17
+ kind: 'assert',
18
+ matcher: 'toolcallWithContent',
19
+ expected: assertion.assert.toolcallWithContent,
20
+ description: assertion.description,
21
+ };
22
+ }
15
23
  return {
16
24
  kind: 'assert',
17
25
  matcher: 'toolcall',
@@ -35,11 +43,32 @@ function normalizeAssertion(assertion) {
35
43
  };
36
44
  }
37
45
  function formatExpectedValue(expected) {
46
+ if (typeof expected === 'object' && expected !== null && !Array.isArray(expected)) {
47
+ return JSON.stringify(expected);
48
+ }
38
49
  if (Array.isArray(expected)) {
39
50
  return `[${expected.join(', ')}]`;
40
51
  }
41
52
  return `"${expected}"`;
42
53
  }
54
+ function isPlainObject(value) {
55
+ return typeof value === 'object' && value !== null && !Array.isArray(value);
56
+ }
57
+ function matchesExpectedProperties(actual, expected) {
58
+ if (Array.isArray(expected)) {
59
+ if (!Array.isArray(actual) || actual.length < expected.length) {
60
+ return false;
61
+ }
62
+ return expected.every((expectedItem, index) => matchesExpectedProperties(actual[index], expectedItem));
63
+ }
64
+ if (isPlainObject(expected)) {
65
+ if (!isPlainObject(actual)) {
66
+ return false;
67
+ }
68
+ return Object.entries(expected).every(([key, value]) => matchesExpectedProperties(actual[key], value));
69
+ }
70
+ return Object.is(actual, expected);
71
+ }
43
72
  function buildAssertionReason(result) {
44
73
  const prefix = result.description ? `${result.description}: ` : '';
45
74
  return `${prefix}${result.matcher}(${formatExpectedValue(result.expected)})`;
@@ -130,6 +159,18 @@ async function evaluateAiAssertion(assertion, output, env) {
130
159
  description: assertion.description,
131
160
  });
132
161
  }
162
+ async function readOutputLog(outputPath) {
163
+ if (!outputPath) {
164
+ return '';
165
+ }
166
+ try {
167
+ const { readFile } = await import('node:fs/promises');
168
+ return await readFile(outputPath, 'utf8');
169
+ }
170
+ catch {
171
+ return '';
172
+ }
173
+ }
133
174
  function evaluateToolCallAssertion(assertion, toolCalls) {
134
175
  const expectedToolCalls = (Array.isArray(assertion.expected) ? assertion.expected : [])
135
176
  .map((toolCall) => toolCall.trim())
@@ -166,11 +207,100 @@ function evaluateToolCallAssertion(assertion, toolCalls) {
166
207
  : `${buildNormalizedAssertionReason(normalizedToolCallAssertion)} failed. Missing: ${formatExpectedValue(missingToolCalls)}. Observed: ${formatExpectedValue(actualToolCalls)}.`,
167
208
  };
168
209
  }
210
+ function evaluateToolCallWithContentAssertion(assertion, logContent) {
211
+ const expected = typeof assertion.expected === 'object' && assertion.expected !== null && !Array.isArray(assertion.expected)
212
+ ? assertion.expected
213
+ : undefined;
214
+ if (!expected) {
215
+ return {
216
+ kind: assertion.kind,
217
+ matcher: assertion.matcher,
218
+ expected: assertion.expected,
219
+ description: assertion.description,
220
+ passed: false,
221
+ reason: `${buildNormalizedAssertionReason(assertion)} failed. Invalid toolcallWithContent expectation.`,
222
+ };
223
+ }
224
+ const expectedTexts = (Array.isArray(expected.hasText) ? expected.hasText : [expected.hasText])
225
+ .filter((value) => typeof value === 'string')
226
+ .map((value) => value.trim())
227
+ .filter(Boolean);
228
+ const toolCallNamesById = new Map();
229
+ const eventPayloadsByToolCallId = new Map();
230
+ const parsedArgumentsByToolCallId = new Map();
231
+ for (const rawLine of logContent.split('\n')) {
232
+ const line = rawLine.trim();
233
+ if (!line) {
234
+ continue;
235
+ }
236
+ try {
237
+ const event = JSON.parse(line);
238
+ if (!event.toolCallId) {
239
+ continue;
240
+ }
241
+ eventPayloadsByToolCallId.set(event.toolCallId, [
242
+ ...(eventPayloadsByToolCallId.get(event.toolCallId) ?? []),
243
+ line,
244
+ ]);
245
+ if (typeof event.toolCallName === 'string' && event.toolCallName.trim()) {
246
+ toolCallNamesById.set(event.toolCallId, event.toolCallName.trim());
247
+ }
248
+ if (typeof event.arguments === 'string' && event.arguments.trim()) {
249
+ try {
250
+ const parsedArguments = JSON.parse(event.arguments);
251
+ parsedArgumentsByToolCallId.set(event.toolCallId, [
252
+ ...(parsedArgumentsByToolCallId.get(event.toolCallId) ?? []),
253
+ parsedArguments,
254
+ ]);
255
+ }
256
+ catch {
257
+ // Ignore unparsable arguments and fall back to raw text matching.
258
+ }
259
+ }
260
+ }
261
+ catch {
262
+ continue;
263
+ }
264
+ }
265
+ const matchingToolCallIds = [...toolCallNamesById.entries()]
266
+ .filter(([, toolCallName]) => toolCallName === expected.name)
267
+ .map(([toolCallId]) => toolCallId);
268
+ if (matchingToolCallIds.length === 0) {
269
+ return {
270
+ kind: assertion.kind,
271
+ matcher: assertion.matcher,
272
+ expected,
273
+ description: assertion.description,
274
+ passed: false,
275
+ reason: `${buildNormalizedAssertionReason(assertion)} failed. Tool call "${expected.name}" was not found in the run log.`,
276
+ };
277
+ }
278
+ const combinedPayload = matchingToolCallIds
279
+ .flatMap((toolCallId) => eventPayloadsByToolCallId.get(toolCallId) ?? [])
280
+ .join('\n');
281
+ const missingTexts = expectedTexts.filter((text) => !combinedPayload.includes(text));
282
+ const hasPropertiesMatch = expected.hasProperties === undefined ||
283
+ matchingToolCallIds.some((toolCallId) => (parsedArgumentsByToolCallId.get(toolCallId) ?? []).some((parsedArguments) => matchesExpectedProperties(parsedArguments, expected.hasProperties)));
284
+ const passed = missingTexts.length === 0 && hasPropertiesMatch;
285
+ return {
286
+ kind: assertion.kind,
287
+ matcher: assertion.matcher,
288
+ expected,
289
+ description: assertion.description,
290
+ passed,
291
+ reason: passed
292
+ ? `${buildNormalizedAssertionReason(assertion)} passed.`
293
+ : !hasPropertiesMatch
294
+ ? `${buildNormalizedAssertionReason(assertion)} failed. Missing properties: ${JSON.stringify(expected.hasProperties ?? {})}.`
295
+ : `${buildNormalizedAssertionReason(assertion)} failed. Missing text: ${formatExpectedValue(missingTexts)}.`,
296
+ };
297
+ }
169
298
  async function evaluateTurnAssertions(options) {
170
299
  if (!options.assertions?.length) {
171
300
  return [];
172
301
  }
173
302
  const results = [];
303
+ let outputLogContent;
174
304
  for (const rawAssertion of options.assertions) {
175
305
  const assertion = normalizeAssertion(rawAssertion);
176
306
  if (assertion.kind === 'assert') {
@@ -178,7 +308,19 @@ async function evaluateTurnAssertions(options) {
178
308
  results.push(evaluateToolCallAssertion(assertion, options.toolCalls));
179
309
  continue;
180
310
  }
181
- const expected = typeof assertion.expected === 'string' ? assertion.expected : assertion.expected.join(', ');
311
+ if (assertion.matcher === 'toolcallWithContent') {
312
+ outputLogContent ??= await readOutputLog(options.outputPath);
313
+ results.push(evaluateToolCallWithContentAssertion(assertion, outputLogContent));
314
+ continue;
315
+ }
316
+ if (assertion.matcher !== 'hasText') {
317
+ continue;
318
+ }
319
+ const expected = typeof assertion.expected === 'string'
320
+ ? assertion.expected
321
+ : Array.isArray(assertion.expected)
322
+ ? assertion.expected.join(', ')
323
+ : JSON.stringify(assertion.expected);
182
324
  const passed = options.output.includes(expected);
183
325
  results.push({
184
326
  kind: assertion.kind,
@@ -8,8 +8,8 @@ type RunAgUiMessageOptions = {
8
8
  allowIdleTimeoutWithAssistantText?: boolean;
9
9
  processTimeoutMs?: number;
10
10
  };
11
- export declare function parseExecutionTestResultId(output: string): string | undefined;
11
+ declare function parseExecutionTestResultId(output: string): string | undefined;
12
+ export { parseExecutionTestResultId };
12
13
  export declare function extractToolCallNames(logContent: string): string[];
13
14
  export declare function extractAppendedLog(previousLogContent: string, latestLogContent: string): string;
14
15
  export declare function runAgUiMessage(options: RunAgUiMessageOptions): Promise<MessageRunResult>;
15
- export {};