@huydao/karrot 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GUIDE.md +484 -0
- package/README.md +253 -0
- package/dist/assertions/assertion.d.ts +18 -0
- package/dist/assertions/assertion.js +198 -0
- package/dist/assertions/turn-eval.d.ts +22 -0
- package/dist/assertions/turn-eval.js +178 -0
- package/dist/executors/adapters/ag-ui-post.d.ts +55 -0
- package/dist/executors/adapters/ag-ui-post.js +703 -0
- package/dist/executors/adapters/ag-ui.d.ts +15 -0
- package/dist/executors/adapters/ag-ui.js +275 -0
- package/dist/executors/execute.d.ts +16 -0
- package/dist/executors/execute.js +145 -0
- package/dist/executors/executor.d.ts +37 -0
- package/dist/executors/executor.js +203 -0
- package/dist/executors/run-result.d.ts +33 -0
- package/dist/executors/run-result.js +22 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.js +28 -0
- package/dist/prompts/turn-eval-system-prompt.md +68 -0
- package/dist/prompts/turn-message-gen-system-prompt.md +16 -0
- package/dist/reports/report.d.ts +68 -0
- package/dist/reports/report.js +366 -0
- package/dist/scenarios/generated-message.d.ts +15 -0
- package/dist/scenarios/generated-message.js +116 -0
- package/dist/scenarios/scenario-loader.d.ts +12 -0
- package/dist/scenarios/scenario-loader.js +103 -0
- package/dist/scenarios/scenario.d.ts +62 -0
- package/dist/scenarios/scenario.js +35 -0
- package/dist/utils/artifact-files.d.ts +3 -0
- package/dist/utils/artifact-files.js +22 -0
- package/dist/utils/config.d.ts +101 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/openai-eval.d.ts +5 -0
- package/dist/utils/openai-eval.js +54 -0
- package/package.json +146 -0
package/GUIDE.md
ADDED
|
@@ -0,0 +1,484 @@
|
|
|
1
|
+
# Karrot Guide
|
|
2
|
+
|
|
3
|
+
This file is the operational reference for using `karrot`.
|
|
4
|
+
|
|
5
|
+
It is written so that:
|
|
6
|
+
- a human engineer can onboard quickly
|
|
7
|
+
- an AI agent can read the file and infer the expected integration pattern
|
|
8
|
+
|
|
9
|
+
## 1. Purpose
|
|
10
|
+
|
|
11
|
+
Use `karrot` when you need a reusable AI-level test framework that can:
|
|
12
|
+
- send one or more user turns to an assistant
|
|
13
|
+
- collect assistant output
|
|
14
|
+
- assert response quality
|
|
15
|
+
- evaluate responses with named metrics
|
|
16
|
+
- write stable artifacts and reports
|
|
17
|
+
|
|
18
|
+
Use cases:
|
|
19
|
+
- product AI chat testing
|
|
20
|
+
- multi-turn assistant regression testing
|
|
21
|
+
- eval-based quality monitoring
|
|
22
|
+
- prompt or orchestration regression checks
|
|
23
|
+
|
|
24
|
+
## 2. Boundary
|
|
25
|
+
|
|
26
|
+
`karrot` owns:
|
|
27
|
+
- scenario modeling
|
|
28
|
+
- scenario execution
|
|
29
|
+
- transport execution
|
|
30
|
+
- assertion and eval execution
|
|
31
|
+
- report generation
|
|
32
|
+
|
|
33
|
+
The consumer project owns:
|
|
34
|
+
- auth/login
|
|
35
|
+
- runtime discovery
|
|
36
|
+
- transport secrets and IDs
|
|
37
|
+
- project-specific scenario context
|
|
38
|
+
- transport variable preparation
|
|
39
|
+
|
|
40
|
+
Do not put product-specific auth logic into `karrot`.
|
|
41
|
+
|
|
42
|
+
## 3. Primary Public API
|
|
43
|
+
|
|
44
|
+
Use these exports first:
|
|
45
|
+
- `execute`
|
|
46
|
+
- `AiScenarioSet`
|
|
47
|
+
- `aiGen`
|
|
48
|
+
|
|
49
|
+
Use lower-level APIs only when needed:
|
|
50
|
+
- `runScenario`
|
|
51
|
+
- `loadConfig`
|
|
52
|
+
- `resolveVariables`
|
|
53
|
+
- `writeScenarioRunReport`
|
|
54
|
+
|
|
55
|
+
Preferred pattern:
|
|
56
|
+
|
|
57
|
+
```ts
|
|
58
|
+
import { execute } from '@huydao/karrot';
|
|
59
|
+
|
|
60
|
+
await execute('./karrot.config.yml', {
|
|
61
|
+
variables,
|
|
62
|
+
scenario: {
|
|
63
|
+
file: './src/scenarios/my-scenarios.ts',
|
|
64
|
+
ids: ['S1'],
|
|
65
|
+
},
|
|
66
|
+
});
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## 4. Required Scenario Module Shape
|
|
70
|
+
|
|
71
|
+
A scenario module must export:
|
|
72
|
+
- `scenarioSet`
|
|
73
|
+
- `buildScenarioContext(projectId)`
|
|
74
|
+
|
|
75
|
+
Example:
|
|
76
|
+
|
|
77
|
+
```ts
|
|
78
|
+
import { AiScenarioSet, type AiScenario, type BaseAiScenarioContext } from '@huydao/karrot';
|
|
79
|
+
|
|
80
|
+
type DemoContext = BaseAiScenarioContext & {
|
|
81
|
+
projectLabel: string;
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
const scenarios: AiScenario<DemoContext>[] = [
|
|
85
|
+
{
|
|
86
|
+
id: 'S1',
|
|
87
|
+
name: 'Greeting flow',
|
|
88
|
+
turns: [
|
|
89
|
+
{
|
|
90
|
+
label: 'Turn 1',
|
|
91
|
+
message: (context) => `Tell me about ${context.projectLabel}.`,
|
|
92
|
+
},
|
|
93
|
+
],
|
|
94
|
+
},
|
|
95
|
+
];
|
|
96
|
+
|
|
97
|
+
export const scenarioSet = new AiScenarioSet(scenarios);
|
|
98
|
+
|
|
99
|
+
export function buildScenarioContext(projectId: string): DemoContext {
|
|
100
|
+
return {
|
|
101
|
+
projectId,
|
|
102
|
+
projectLabel: 'RA Sample Project',
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## 5. Turn Model
|
|
108
|
+
|
|
109
|
+
Each turn supports:
|
|
110
|
+
- `label`
|
|
111
|
+
- `message`
|
|
112
|
+
- `idleTimeoutMs`
|
|
113
|
+
- `processTimeoutMs`
|
|
114
|
+
- `assertions`
|
|
115
|
+
- `eval`
|
|
116
|
+
- `onComplete`
|
|
117
|
+
|
|
118
|
+
### Message forms
|
|
119
|
+
|
|
120
|
+
`message` can be:
|
|
121
|
+
- a function `(context) => string`
|
|
122
|
+
- `aiGen.fromPreviousContext()`
|
|
123
|
+
- `aiGen.fromGuidance(guidance)`
|
|
124
|
+
- `aiGen.fromContent(content)`
|
|
125
|
+
|
|
126
|
+
### Example
|
|
127
|
+
|
|
128
|
+
```ts
|
|
129
|
+
{
|
|
130
|
+
label: 'Turn 2',
|
|
131
|
+
message: aiGen.fromGuidance(
|
|
132
|
+
'Ask for 3 follow-up prompts the user can send next.',
|
|
133
|
+
),
|
|
134
|
+
assertions: [
|
|
135
|
+
{ assert: { hasText: 'prompt' } },
|
|
136
|
+
],
|
|
137
|
+
eval: ['correctness', 'helpfulness'],
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## 6. Assertions
|
|
142
|
+
|
|
143
|
+
Karrot supports two assertion families.
|
|
144
|
+
|
|
145
|
+
### Direct assertions
|
|
146
|
+
|
|
147
|
+
Use direct assertions when the expected outcome is deterministic enough for an exact check.
|
|
148
|
+
|
|
149
|
+
Supported forms:
|
|
150
|
+
- `assert.hasText`
|
|
151
|
+
- `assert.toolcall`
|
|
152
|
+
|
|
153
|
+
Examples:
|
|
154
|
+
|
|
155
|
+
```ts
|
|
156
|
+
assertions: [
|
|
157
|
+
{ assert: { hasText: 'Katalon AI' } },
|
|
158
|
+
{ assert: { toolcall: ['search_document'] } },
|
|
159
|
+
{ assert: { toolcall: [] } },
|
|
160
|
+
]
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### AI assertions
|
|
164
|
+
|
|
165
|
+
Use AI assertions when the check is semantic.
|
|
166
|
+
|
|
167
|
+
Supported forms:
|
|
168
|
+
- `aiAssert.hasContent`
|
|
169
|
+
- `aiAssert.notHasContent`
|
|
170
|
+
|
|
171
|
+
Examples:
|
|
172
|
+
|
|
173
|
+
```ts
|
|
174
|
+
assertions: [
|
|
175
|
+
{
|
|
176
|
+
aiAssert: {
|
|
177
|
+
hasContent: 'The answer should describe concrete next steps for the user.',
|
|
178
|
+
},
|
|
179
|
+
},
|
|
180
|
+
{
|
|
181
|
+
aiAssert: {
|
|
182
|
+
notHasContent: 'The answer invents unsupported product capabilities.',
|
|
183
|
+
},
|
|
184
|
+
},
|
|
185
|
+
]
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## 7. Evaluations
|
|
189
|
+
|
|
190
|
+
Evals are separate from assertions.
|
|
191
|
+
|
|
192
|
+
Use assertions to decide pass or fail against a requirement.
|
|
193
|
+
Use evals to score response quality along named dimensions.
|
|
194
|
+
|
|
195
|
+
### Built-in dimensions
|
|
196
|
+
|
|
197
|
+
Common built-in dimensions:
|
|
198
|
+
- `correctness`
|
|
199
|
+
- `coverage`
|
|
200
|
+
- `helpfulness`
|
|
201
|
+
- `clarity`
|
|
202
|
+
- `completeness`
|
|
203
|
+
- `conciseness`
|
|
204
|
+
- `relevance`
|
|
205
|
+
- `actionability`
|
|
206
|
+
- `structure`
|
|
207
|
+
- `consistency`
|
|
208
|
+
- `safety`
|
|
209
|
+
|
|
210
|
+
### Custom dimensions
|
|
211
|
+
|
|
212
|
+
Use either:
|
|
213
|
+
- inline guidance
|
|
214
|
+
- project-level prompt files
|
|
215
|
+
|
|
216
|
+
Inline guidance example:
|
|
217
|
+
|
|
218
|
+
```ts
|
|
219
|
+
eval: [
|
|
220
|
+
'correctness',
|
|
221
|
+
{
|
|
222
|
+
dimension: 'productFit',
|
|
223
|
+
guidance: 'Judge whether the answer is specifically useful for this product domain.',
|
|
224
|
+
},
|
|
225
|
+
]
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Project-level prompt example:
|
|
229
|
+
|
|
230
|
+
```yml
|
|
231
|
+
evaluation:
|
|
232
|
+
promptDirectory: ./prompts/eval
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Then add files such as:
|
|
236
|
+
- `./prompts/eval/product-fit.md`
|
|
237
|
+
- `./prompts/eval/next-step-quality.md`
|
|
238
|
+
|
|
239
|
+
The scenario then only needs:
|
|
240
|
+
|
|
241
|
+
```ts
|
|
242
|
+
eval: ['correctness', 'productFit']
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Full prompt override
|
|
246
|
+
|
|
247
|
+
If the whole eval rubric changes, use:
|
|
248
|
+
|
|
249
|
+
```yml
|
|
250
|
+
evaluation:
|
|
251
|
+
systemPromptPath: ./prompts/turn-eval-testcase-generation-system-prompt.md
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Use `systemPromptPath` for a full override.
|
|
255
|
+
Use `promptDirectory` for additive project-specific guidance.
|
|
256
|
+
|
|
257
|
+
## 8. Config Shape
|
|
258
|
+
|
|
259
|
+
Karrot config is versioned.
|
|
260
|
+
|
|
261
|
+
Current shape:
|
|
262
|
+
|
|
263
|
+
```yml
|
|
264
|
+
version: 1
|
|
265
|
+
transport: ...
|
|
266
|
+
artifacts:
|
|
267
|
+
directory: ./artifacts
|
|
268
|
+
execution:
|
|
269
|
+
stopOnFailure: false
|
|
270
|
+
evaluation:
|
|
271
|
+
systemPromptPath: ./prompts/...
|
|
272
|
+
promptDirectory: ./prompts/eval
|
|
273
|
+
context:
|
|
274
|
+
projectId: ${PROJECT_ID}
|
|
275
|
+
report:
|
|
276
|
+
enabled: true
|
|
277
|
+
environment: prod
|
|
278
|
+
projectName: Demo Project
|
|
279
|
+
runtime: ...
|
|
280
|
+
scenarioContext: ...
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Important design choice:
|
|
284
|
+
- config and scenario are separate
|
|
285
|
+
- `execute()` receives the scenario file independently
|
|
286
|
+
|
|
287
|
+
That allows one transport config to run many scenario files.
|
|
288
|
+
|
|
289
|
+
## 9. Variable Resolution
|
|
290
|
+
|
|
291
|
+
Karrot resolves `${VARIABLE}` from:
|
|
292
|
+
- `options.variables`
|
|
293
|
+
- then `process.env`
|
|
294
|
+
|
|
295
|
+
Example:
|
|
296
|
+
|
|
297
|
+
```yml
|
|
298
|
+
context:
|
|
299
|
+
projectId: ${PROJECT_ID}
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
```ts
|
|
303
|
+
await execute('./karrot.config.yml', {
|
|
304
|
+
variables: {
|
|
305
|
+
PROJECT_ID: '3422056',
|
|
306
|
+
},
|
|
307
|
+
scenario: {
|
|
308
|
+
file: './src/scenarios/basic-two-turn-demo.ts',
|
|
309
|
+
},
|
|
310
|
+
});
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## 10. Transport Patterns
|
|
314
|
+
|
|
315
|
+
### AG-UI-WSS
|
|
316
|
+
|
|
317
|
+
Use when the target assistant follows the AG-UI WSS contract.
|
|
318
|
+
|
|
319
|
+
Config shape:
|
|
320
|
+
|
|
321
|
+
```yml
|
|
322
|
+
transport:
|
|
323
|
+
type: ag-ui-wss
|
|
324
|
+
env:
|
|
325
|
+
JWT: ${JWT}
|
|
326
|
+
ACCOUNT_ID: ${ACCOUNT_ID}
|
|
327
|
+
PROJECT_ID: ${PROJECT_ID}
|
|
328
|
+
AGENT_URL: ${AGENT_URL}
|
|
329
|
+
AGENT_ID: ${AGENT_ID}
|
|
330
|
+
WS_URL: ${WS_URL}
|
|
331
|
+
WS_TOPIC: ${WS_TOPIC}
|
|
332
|
+
processTimeoutMs: 120000
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
Notes:
|
|
336
|
+
- `karrot` uses `ag-ui-wss` as the primary transport
|
|
337
|
+
- raw AG-UI events are written to `.jsonl`
|
|
338
|
+
- assistant output is parsed from `TEXT_MESSAGE_CONTENT`
|
|
339
|
+
|
|
340
|
+
### AG-UI-POST
|
|
341
|
+
|
|
342
|
+
Use when the target assistant exposes AG-UI-style HTTP endpoints.
|
|
343
|
+
|
|
344
|
+
Karrot supports:
|
|
345
|
+
- submit-only
|
|
346
|
+
- submit + connect
|
|
347
|
+
- submit + observe poll
|
|
348
|
+
|
|
349
|
+
Use this only when the project runtime requires HTTP-based AG-UI, not WSS.
|
|
350
|
+
|
|
351
|
+
## 11. Artifacts and Reports
|
|
352
|
+
|
|
353
|
+
Each run creates:
|
|
354
|
+
- `artifacts/<timestamp>/`
|
|
355
|
+
- raw transport logs
|
|
356
|
+
- one JSON report
|
|
357
|
+
- one HTML report
|
|
358
|
+
|
|
359
|
+
The report includes:
|
|
360
|
+
- environment
|
|
361
|
+
- runtime snapshot
|
|
362
|
+
- scenario context
|
|
363
|
+
- scenario status
|
|
364
|
+
- turn outputs
|
|
365
|
+
- assertion results
|
|
366
|
+
- eval results
|
|
367
|
+
- timing metrics
|
|
368
|
+
|
|
369
|
+
## 12. OpenAI Requirements
|
|
370
|
+
|
|
371
|
+
OpenAI features are used for:
|
|
372
|
+
- AI assertions
|
|
373
|
+
- turn evaluation
|
|
374
|
+
- generated user messages
|
|
375
|
+
|
|
376
|
+
Required:
|
|
377
|
+
- `OPENAI_API_KEY`
|
|
378
|
+
|
|
379
|
+
Optional:
|
|
380
|
+
- `OPENAI_BASE_URL`
|
|
381
|
+
- `OPENAI_EVAL_MODEL`
|
|
382
|
+
- `OPENAI_MESSAGE_GEN_MODEL`
|
|
383
|
+
|
|
384
|
+
Defaults:
|
|
385
|
+
- eval model: `gpt-5.4`
|
|
386
|
+
- message generation model: `gpt-5.4-mini`
|
|
387
|
+
|
|
388
|
+
## 13. Recommended Integration Pattern
|
|
389
|
+
|
|
390
|
+
For a new consumer project:
|
|
391
|
+
|
|
392
|
+
1. prepare auth/runtime outside `karrot`
|
|
393
|
+
2. map runtime values to variables
|
|
394
|
+
3. create a `karrot.config.yml`
|
|
395
|
+
4. create one scenario module
|
|
396
|
+
5. call `execute()`
|
|
397
|
+
|
|
398
|
+
Example structure:
|
|
399
|
+
|
|
400
|
+
```text
|
|
401
|
+
consumer-project/
|
|
402
|
+
├── karrot.config.yml
|
|
403
|
+
├── prompts/
|
|
404
|
+
│ └── eval/
|
|
405
|
+
├── src/
|
|
406
|
+
│ ├── runtime/
|
|
407
|
+
│ ├── run-demo.ts
|
|
408
|
+
│ └── scenarios/
|
|
409
|
+
└── artifacts/
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
## 14. Debugging Checklist
|
|
413
|
+
|
|
414
|
+
If a run fails:
|
|
415
|
+
|
|
416
|
+
1. inspect the JSON report
|
|
417
|
+
2. inspect the HTML report
|
|
418
|
+
3. inspect raw transport files in the artifact directory
|
|
419
|
+
4. verify the scenario message that was actually sent
|
|
420
|
+
5. verify transport variables after `${...}` resolution
|
|
421
|
+
6. verify that `OPENAI_API_KEY` is set if AI assertions, evals, or message generation are used
|
|
422
|
+
|
|
423
|
+
For AG-UI-WSS specifically:
|
|
424
|
+
- confirm JSONL contains `TEXT_MESSAGE_CONTENT`
|
|
425
|
+
- confirm `WS_TOPIC` and auth headers are correct
|
|
426
|
+
- do not treat transport progress logs as assistant output
|
|
427
|
+
|
|
428
|
+
## 15. Rules for AI Agents
|
|
429
|
+
|
|
430
|
+
If you are an AI agent editing or using `karrot`, follow these rules:
|
|
431
|
+
|
|
432
|
+
1. Keep product-specific login/runtime discovery outside `karrot`.
|
|
433
|
+
2. Prefer `execute()` unless there is a concrete reason to use lower-level APIs.
|
|
434
|
+
3. Keep transport config in YAML, and keep scenario selection out of the config (pass it to `execute()` instead).
|
|
435
|
+
4. Use scenario context for product data, not hard-coded strings scattered across turns.
|
|
436
|
+
5. Use `promptDirectory` for project-specific eval rubric extensions.
|
|
437
|
+
6. Use `systemPromptPath` only when replacing the entire eval rubric.
|
|
438
|
+
7. Prefer direct assertions when exact checks are stable; use AI assertions only for semantic checks.
|
|
439
|
+
8. Preserve raw artifacts. Do not hide transport logs needed for debugging.
|
|
440
|
+
|
|
441
|
+
## 16. Quick Reference
|
|
442
|
+
|
|
443
|
+
### Minimal execution
|
|
444
|
+
|
|
445
|
+
```ts
|
|
446
|
+
await execute('./karrot.config.yml', {
|
|
447
|
+
variables,
|
|
448
|
+
scenario: {
|
|
449
|
+
file: './src/scenarios/basic-two-turn-demo.ts',
|
|
450
|
+
},
|
|
451
|
+
});
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
### Select specific scenarios
|
|
455
|
+
|
|
456
|
+
```ts
|
|
457
|
+
await execute('./karrot.config.yml', {
|
|
458
|
+
variables,
|
|
459
|
+
scenario: {
|
|
460
|
+
file: './src/scenarios/my-scenarios.ts',
|
|
461
|
+
ids: ['S1', 'S3'],
|
|
462
|
+
},
|
|
463
|
+
});
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
### Generate a user message
|
|
467
|
+
|
|
468
|
+
```ts
|
|
469
|
+
message: aiGen.fromGuidance('Ask for a shorter follow-up question.')
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
### Use project prompt files
|
|
473
|
+
|
|
474
|
+
```yml
|
|
475
|
+
evaluation:
|
|
476
|
+
promptDirectory: ./prompts/eval
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
### Override full eval system prompt
|
|
480
|
+
|
|
481
|
+
```yml
|
|
482
|
+
evaluation:
|
|
483
|
+
systemPromptPath: ./prompts/turn-eval-testcase-generation-system-prompt.md
|
|
484
|
+
```
|
package/README.md
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# karrot
|
|
2
|
+
|
|
3
|
+
`karrot` is a reusable AI test runner for multi-turn assistant scenarios.
|
|
4
|
+
|
|
5
|
+
It gives you:
|
|
6
|
+
- scenario execution
|
|
7
|
+
- AG-UI transport integration
|
|
8
|
+
- string and AI-based assertions
|
|
9
|
+
- turn evaluation with OpenAI
|
|
10
|
+
- JSON and HTML reports
|
|
11
|
+
|
|
12
|
+
This package is designed to be published independently and reused across projects.
|
|
13
|
+
|
|
14
|
+
## What Karrot Owns
|
|
15
|
+
|
|
16
|
+
`karrot` is responsible for the AI-test layer:
|
|
17
|
+
- load config
|
|
18
|
+
- resolve `${VARIABLE}` templates
|
|
19
|
+
- load scenario modules
|
|
20
|
+
- execute turns
|
|
21
|
+
- run assertions and evals
|
|
22
|
+
- write artifacts and reports
|
|
23
|
+
|
|
24
|
+
`karrot` does not own product-specific runtime discovery. The consumer project should prepare data such as:
|
|
25
|
+
- `PROJECT_ID`
|
|
26
|
+
- `JWT`
|
|
27
|
+
- `ACCOUNT_ID`
|
|
28
|
+
- `WS_URL`
|
|
29
|
+
- `WS_TOPIC`
|
|
30
|
+
- any transport-specific headers or IDs
|
|
31
|
+
|
|
32
|
+
## Core Entry Point
|
|
33
|
+
|
|
34
|
+
The main high-level API is `execute()`.
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
import { execute } from '@huydao/karrot';
|
|
38
|
+
|
|
39
|
+
await execute('./karrot.config.yml', {
|
|
40
|
+
variables: {
|
|
41
|
+
PROJECT_ID: '3422056',
|
|
42
|
+
JWT: process.env.JWT,
|
|
43
|
+
ACCOUNT_ID: process.env.ACCOUNT_ID,
|
|
44
|
+
WS_URL: process.env.WS_URL,
|
|
45
|
+
WS_TOPIC: process.env.WS_TOPIC,
|
|
46
|
+
},
|
|
47
|
+
scenario: {
|
|
48
|
+
file: './src/scenarios/basic-two-turn-demo.ts',
|
|
49
|
+
},
|
|
50
|
+
});
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
`execute()` will:
|
|
54
|
+
1. load YAML or JSON config
|
|
55
|
+
2. resolve `${...}` variables
|
|
56
|
+
3. create `artifacts/<timestamp>`
|
|
57
|
+
4. load the scenario module
|
|
58
|
+
5. run selected scenarios
|
|
59
|
+
6. write JSON and HTML reports
|
|
60
|
+
|
|
61
|
+
## Scenario Authoring
|
|
62
|
+
|
|
63
|
+
A scenario module exports:
|
|
64
|
+
- `scenarioSet`
|
|
65
|
+
- `buildScenarioContext(projectId)`
|
|
66
|
+
|
|
67
|
+
Minimal example:
|
|
68
|
+
|
|
69
|
+
```ts
|
|
70
|
+
import { AiScenarioSet, type AiScenario, type BaseAiScenarioContext } from '@huydao/karrot';
|
|
71
|
+
|
|
72
|
+
const scenarios: AiScenario<BaseAiScenarioContext>[] = [
|
|
73
|
+
{
|
|
74
|
+
id: 'BASIC-2T',
|
|
75
|
+
name: 'Basic Two-Turn Demo',
|
|
76
|
+
turns: [
|
|
77
|
+
{
|
|
78
|
+
label: 'Turn 1',
|
|
79
|
+
message: () => 'Hello. What can you help me with in Katalon AI?',
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
label: 'Turn 2',
|
|
83
|
+
message: () => 'Give me 3 short example prompts I can ask next.',
|
|
84
|
+
},
|
|
85
|
+
],
|
|
86
|
+
},
|
|
87
|
+
];
|
|
88
|
+
|
|
89
|
+
export const scenarioSet = new AiScenarioSet(scenarios);
|
|
90
|
+
|
|
91
|
+
export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
|
|
92
|
+
return { projectId };
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Assertions
|
|
97
|
+
|
|
98
|
+
Karrot supports two assertion styles.
|
|
99
|
+
|
|
100
|
+
Direct assertions:
|
|
101
|
+
|
|
102
|
+
```ts
|
|
103
|
+
assertions: [
|
|
104
|
+
{ assert: { hasText: 'Katalon AI' } },
|
|
105
|
+
{ assert: { toolcall: [] } },
|
|
106
|
+
]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
AI assertions:
|
|
110
|
+
|
|
111
|
+
```ts
|
|
112
|
+
assertions: [
|
|
113
|
+
{ aiAssert: { hasContent: 'The answer explains what Katalon AI can do.' } },
|
|
114
|
+
{ aiAssert: { notHasContent: 'The answer invents unsupported product features.' } },
|
|
115
|
+
]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Evaluations
|
|
119
|
+
|
|
120
|
+
Turn evals score the assistant response for named dimensions.
|
|
121
|
+
|
|
122
|
+
```ts
|
|
123
|
+
eval: ['correctness', 'coverage', 'helpfulness']
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Custom dimensions are also supported:
|
|
127
|
+
|
|
128
|
+
```ts
|
|
129
|
+
eval: [
|
|
130
|
+
'correctness',
|
|
131
|
+
{
|
|
132
|
+
dimension: 'productFit',
|
|
133
|
+
guidance: 'Judge whether the answer is specifically useful for a Katalon AI user.',
|
|
134
|
+
},
|
|
135
|
+
]
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Project-level eval prompts can be configured through:
|
|
139
|
+
- `evaluation.systemPromptPath`
|
|
140
|
+
- `evaluation.promptDirectory`
|
|
141
|
+
|
|
142
|
+
That lets the project define rubric files without repeating inline guidance in every scenario.
|
|
143
|
+
|
|
144
|
+
## AI-Generated User Messages
|
|
145
|
+
|
|
146
|
+
Karrot can generate a user turn message before sending it to the target assistant.
|
|
147
|
+
|
|
148
|
+
Available helpers:
|
|
149
|
+
- `aiGen.fromPreviousContext()`
|
|
150
|
+
- `aiGen.fromGuidance(guidance)`
|
|
151
|
+
- `aiGen.fromContent(content)`
|
|
152
|
+
|
|
153
|
+
Example:
|
|
154
|
+
|
|
155
|
+
```ts
|
|
156
|
+
import { aiGen } from '@huydao/karrot';
|
|
157
|
+
|
|
158
|
+
message: aiGen.fromGuidance(
|
|
159
|
+
'Ask for 3 concise follow-up prompts the user can send next based on the previous answer.',
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Message generation requires `OPENAI_API_KEY` to be set.
|
|
164
|
+
|
|
165
|
+
## Config Overview
|
|
166
|
+
|
|
167
|
+
Karrot config currently supports:
|
|
168
|
+
- `transport`
|
|
169
|
+
- `artifacts.directory`
|
|
170
|
+
- `execution.stopOnFailure`
|
|
171
|
+
- `evaluation.systemPromptPath`
|
|
172
|
+
- `evaluation.promptDirectory`
|
|
173
|
+
- `context`
|
|
174
|
+
- `report`
|
|
175
|
+
|
|
176
|
+
Example `ag-ui-wss` config:
|
|
177
|
+
|
|
178
|
+
```yml
|
|
179
|
+
version: 1
|
|
180
|
+
|
|
181
|
+
transport:
|
|
182
|
+
type: ag-ui-wss
|
|
183
|
+
env:
|
|
184
|
+
JWT: ${JWT}
|
|
185
|
+
ACCOUNT_ID: ${ACCOUNT_ID}
|
|
186
|
+
PROJECT_ID: ${PROJECT_ID}
|
|
187
|
+
AGENT_URL: ${AGENT_URL}
|
|
188
|
+
AGENT_ID: ${AGENT_ID}
|
|
189
|
+
WS_URL: ${WS_URL}
|
|
190
|
+
WS_TOPIC: ${WS_TOPIC}
|
|
191
|
+
WS_STOMP_HEADERS: Authorization:${JWT}
|
|
192
|
+
WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
|
|
193
|
+
|
|
194
|
+
context:
|
|
195
|
+
projectId: ${PROJECT_ID}
|
|
196
|
+
|
|
197
|
+
report:
|
|
198
|
+
environment: prod
|
|
199
|
+
projectName: Demo Project
|
|
200
|
+
runtime:
|
|
201
|
+
agentUrl: ${AGENT_URL}
|
|
202
|
+
agentId: ${AGENT_ID}
|
|
203
|
+
wsUrl: ${WS_URL}
|
|
204
|
+
wsTopic: ${WS_TOPIC}
|
|
205
|
+
accountId: ${ACCOUNT_ID}
|
|
206
|
+
projectId: ${PROJECT_ID}
|
|
207
|
+
appBaseUrl: ${APP_BASE_URL}
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Reports and Artifacts
|
|
211
|
+
|
|
212
|
+
Each `execute()` run creates:
|
|
213
|
+
- a run artifact directory under `artifacts/<timestamp>`
|
|
214
|
+
- raw transport logs such as `.jsonl` or `.sse`
|
|
215
|
+
- a JSON run report
|
|
216
|
+
- an HTML run report
|
|
217
|
+
|
|
218
|
+
## Environment Variables
|
|
219
|
+
|
|
220
|
+
Common variables:
|
|
221
|
+
- `OPENAI_API_KEY`
|
|
222
|
+
- `OPENAI_BASE_URL`
|
|
223
|
+
- `OPENAI_EVAL_MODEL`
|
|
224
|
+
- `OPENAI_MESSAGE_GEN_MODEL`
|
|
225
|
+
|
|
226
|
+
Transport-specific variables depend on the integration project.
|
|
227
|
+
|
|
228
|
+
## Package Structure
|
|
229
|
+
|
|
230
|
+
- `assertions/`: direct assertions and turn evaluation
|
|
231
|
+
- `executors/`: transport runners and scenario execution
|
|
232
|
+
- `reports/`: JSON and HTML reporting
|
|
233
|
+
- `scenarios/`: scenario types, loaders, generated-message helpers
|
|
234
|
+
- `utils/`: config loading, artifacts, OpenAI helpers
|
|
235
|
+
- `prompts/`: built-in prompt files used by the package
|
|
236
|
+
|
|
237
|
+
## AI-Friendly Guide
|
|
238
|
+
|
|
239
|
+
For a fuller operational guide intended for both humans and AI agents, read [GUIDE.md](./GUIDE.md).
|
|
240
|
+
|
|
241
|
+
## Build
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
cd karrot
|
|
245
|
+
npx tsc -p tsconfig.json
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Publish
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
cd karrot
|
|
252
|
+
npm publish
|
|
253
|
+
```
|