@huydao/karrot 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/GUIDE.md +484 -0
  2. package/README.md +253 -0
  3. package/dist/assertions/assertion.d.ts +18 -0
  4. package/dist/assertions/assertion.js +198 -0
  5. package/dist/assertions/turn-eval.d.ts +22 -0
  6. package/dist/assertions/turn-eval.js +178 -0
  7. package/dist/executors/adapters/ag-ui-post.d.ts +55 -0
  8. package/dist/executors/adapters/ag-ui-post.js +703 -0
  9. package/dist/executors/adapters/ag-ui.d.ts +15 -0
  10. package/dist/executors/adapters/ag-ui.js +275 -0
  11. package/dist/executors/execute.d.ts +16 -0
  12. package/dist/executors/execute.js +145 -0
  13. package/dist/executors/executor.d.ts +37 -0
  14. package/dist/executors/executor.js +203 -0
  15. package/dist/executors/run-result.d.ts +33 -0
  16. package/dist/executors/run-result.js +22 -0
  17. package/dist/index.d.ts +12 -0
  18. package/dist/index.js +28 -0
  19. package/dist/prompts/turn-eval-system-prompt.md +68 -0
  20. package/dist/prompts/turn-message-gen-system-prompt.md +16 -0
  21. package/dist/reports/report.d.ts +68 -0
  22. package/dist/reports/report.js +366 -0
  23. package/dist/scenarios/generated-message.d.ts +15 -0
  24. package/dist/scenarios/generated-message.js +116 -0
  25. package/dist/scenarios/scenario-loader.d.ts +12 -0
  26. package/dist/scenarios/scenario-loader.js +103 -0
  27. package/dist/scenarios/scenario.d.ts +62 -0
  28. package/dist/scenarios/scenario.js +35 -0
  29. package/dist/utils/artifact-files.d.ts +3 -0
  30. package/dist/utils/artifact-files.js +22 -0
  31. package/dist/utils/config.d.ts +101 -0
  32. package/dist/utils/config.js +57 -0
  33. package/dist/utils/openai-eval.d.ts +5 -0
  34. package/dist/utils/openai-eval.js +54 -0
  35. package/package.json +146 -0
@@ -0,0 +1,203 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.runScenario = runScenario;
4
+ const assertion_1 = require("../assertions/assertion");
5
+ const generated_message_1 = require("../scenarios/generated-message");
6
+ const turn_eval_1 = require("../assertions/turn-eval");
7
+ const report_1 = require("../reports/report");
8
+ const run_result_1 = require("./run-result");
9
/**
 * Coerces a raw setting to a timeout in milliseconds.
 *
 * @param value Any value; it is converted with `Number(value)`.
 * @returns The converted number when it is finite and strictly greater than
 *   zero, otherwise `undefined`.
 */
function readPositiveTimeoutMs(value) {
    const timeoutMs = Number(value);
    // NaN, ±Infinity, zero and negatives all mean "no usable timeout".
    if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
        return undefined;
    }
    return timeoutMs;
}
13
/**
 * Works out the process timeout to hand to the message runner for one turn.
 *
 * @param options.env Environment map; `AI_TURN_TIMEOUT_MS` is read from it.
 * @param options.turn Turn definition; `processTimeoutMs` is read from it.
 * @param options.remainingMs Optional remaining budget in milliseconds.
 * @returns The requested timeout, which may be `undefined` when nothing is
 *   configured and `remainingMs` is not a number. When `remainingMs` is a
 *   number, the requested timeout is capped at `remainingMs` with a lower
 *   bound of 1.
 */
function resolveTurnProcessTimeoutMs(options) {
    const { env, turn, remainingMs } = options;
    const envFloorMs = readPositiveTimeoutMs(env.AI_TURN_TIMEOUT_MS);
    let requestedMs = turn.processTimeoutMs;
    if (typeof envFloorMs === 'number') {
        // A valid env value can only raise the turn's own timeout, never lower it.
        requestedMs = Math.max(requestedMs ?? 0, envFloorMs);
    }
    if (typeof remainingMs !== 'number') {
        return requestedMs;
    }
    // With no requested timeout, the whole remaining budget is used.
    const cappedMs = Math.min(requestedMs ?? remainingMs, remainingMs);
    return Math.max(1, cappedMs);
}
23
/**
 * Projects recorded turn results onto the `{ label, message, output }` entries
 * that are handed to message generation and turn evaluation as history.
 */
function toTurnHistory(turns) {
    return turns.map((completedTurn) => ({
        label: completedTurn.label,
        message: completedTurn.message,
        output: completedTurn.output,
    }));
}
/**
 * Converts a turn's `eval` list to plain strings: string entries are kept and
 * any other entry contributes `String(entry.dimension)`. Returns `undefined`
 * when the turn has no `eval` list.
 */
function toEvalDimensionNames(dimensions) {
    return dimensions
        ? dimensions.map((dimension) => (typeof dimension === 'string' ? dimension : String(dimension.dimension)))
        : undefined;
}
/**
 * Runs every turn of one scenario in order and collects the per-turn results.
 *
 * For each turn this checks the optional deadline, resolves the user message,
 * invokes `messageRunner`, awaits the turn's optional `onComplete` hook, and
 * then evaluates the turn's assertions and eval dimensions against the output.
 *
 * @param scenario Scenario to run; `id`, `name` and `turns` are read.
 * @param context Passed through to message resolution, the runner and `onComplete`.
 * @param env Environment map. Each turn gets a copy that additionally carries
 *   `IDLE_TIMEOUT` when the turn sets a truthy `idleTimeoutMs`.
 * @param outputDirectory Passed to the runner and to assertion evaluation.
 * @param deadlineAt Optional deadline in epoch milliseconds (compared with `Date.now()`).
 * @param messageRunner Callback that executes one message and resolves to the run result.
 * @param initialThreadId Thread id for the first turn; later turns receive the
 *   id returned by the previous run.
 * @returns The scenario result after `finalizeScenarioResult`.
 * @throws {ScenarioExecutionError} When the runner, `onComplete`, an assertion
 *   or an evaluation fails for a turn; the error carries the finalized FAIL result.
 * @throws {Error} Unwrapped, when the deadline has already passed before a turn
 *   starts or when message resolution throws. Both happen outside the per-turn
 *   try block, so turns completed so far are not attached to that error.
 */
async function runSingleScenario(scenario, context, env, outputDirectory, deadlineAt, messageRunner, initialThreadId) {
    const result = {
        id: scenario.id,
        name: scenario.name,
        status: 'PASS',
        turns: [],
        metrics: {},
    };
    let threadId = initialThreadId;
    console.log(`\n=== ${scenario.id} — ${scenario.name} ===`);
    for (const turn of scenario.turns) {
        // The deadline is only checked between turns; the remaining budget is
        // also passed to the runner as an upper bound on processTimeoutMs.
        const remainingMs = typeof deadlineAt === 'number' ? deadlineAt - Date.now() : undefined;
        if (typeof remainingMs === 'number' && remainingMs <= 0) {
            throw new Error(`Scenario ${scenario.id} exceeded the maximum execution time.`);
        }
        const message = await (0, generated_message_1.resolveTurnMessage)({
            turn,
            context,
            env,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            history: toTurnHistory(result.turns),
        });
        console.log(`\n--- ${scenario.id} ${turn.label} ---`);
        const turnEnv = {
            ...env,
            ...(turn.idleTimeoutMs ? { IDLE_TIMEOUT: String(turn.idleTimeoutMs) } : {}),
        };
        try {
            if (!messageRunner) {
                throw new Error('runScenario requires a messageRunner. Provide a transport adapter from the AUT layer.');
            }
            const run = await messageRunner({
                scenario,
                turn,
                context,
                message,
                env: turnEnv,
                outputDirectory,
                threadId,
                processTimeoutMs: resolveTurnProcessTimeoutMs({
                    env: turnEnv,
                    turn,
                    remainingMs,
                }),
            });
            await turn.onComplete?.({ context, output: run.output });
            threadId = run.threadId;
            result.threadId = run.threadId;
            const turnResult = {
                label: turn.label,
                message,
                evalDimensions: toEvalDimensionNames(turn.eval),
                threadId: run.threadId,
                outputPath: run.outputPath,
                output: run.output,
                note: run.note,
                toolCallCount: run.toolCallCount,
                toolCalls: run.toolCalls,
                metrics: run.metrics,
            };
            let turnRecorded = false;
            try {
                const assertionResults = await (0, assertion_1.evaluateTurnAssertions)({
                    assertions: turn.assertions,
                    output: run.output,
                    toolCalls: run.toolCalls,
                    env: turnEnv,
                    outputDirectory,
                });
                const failedAssertions = assertionResults.filter((assertion) => !assertion.passed);
                const assertionFailureNote = failedAssertions.length > 0
                    ? failedAssertions.map((assertion) => assertion.reason).join(' ')
                    : undefined;
                // Evaluations still run when assertions failed, so the recorded
                // turn carries both result sets.
                const evaluationResults = await (0, turn_eval_1.evaluateTurnEvals)({
                    dimensions: turn.eval,
                    scenarioId: scenario.id,
                    scenarioName: scenario.name,
                    turnLabel: turn.label,
                    output: run.output,
                    env: turnEnv,
                    history: [
                        ...toTurnHistory(result.turns),
                        {
                            label: turn.label,
                            message,
                            output: run.output,
                        },
                    ],
                });
                turnResult.assertionResults = assertionResults;
                turnResult.evaluationResults = evaluationResults;
                turnResult.note = [run.note, assertionFailureNote].filter(Boolean).join(' ') || undefined;
                result.turns.push(turnResult);
                turnRecorded = true;
                if (assertionFailureNote) {
                    throw new Error(assertionFailureNote);
                }
            }
            catch (error) {
                // Once the turn is recorded, the only error that can reach this
                // handler is the assertion failure thrown just above, and its text
                // is already part of turnResult.note. Appending it again would
                // duplicate the failure reason, so the note is only extended for
                // errors raised before the turn was recorded.
                if (!turnRecorded) {
                    turnResult.note = [
                        turnResult.note,
                        error instanceof Error ? error.message : String(error),
                    ]
                        .filter(Boolean)
                        .join(' ');
                    result.turns.push(turnResult);
                }
                throw error;
            }
        }
        catch (error) {
            // A MessageRunError carries whatever the runner captured before it
            // failed; record that as the turn so it is not lost from the result.
            if (error instanceof run_result_1.MessageRunError) {
                result.threadId = error.threadId ?? result.threadId;
                result.turns.push({
                    label: turn.label,
                    message,
                    evalDimensions: toEvalDimensionNames(turn.eval),
                    threadId: error.threadId,
                    outputPath: error.outputPath,
                    output: error.output,
                    note: error.message,
                    toolCallCount: error.toolCallCount,
                    toolCalls: error.toolCalls,
                    metrics: error.metrics,
                });
            }
            result.status = 'FAIL';
            result.note = error instanceof Error ? error.message : String(error);
            throw new report_1.ScenarioExecutionError((0, report_1.finalizeScenarioResult)(result));
        }
    }
    const turnNotes = result.turns.map((turn) => turn.note).filter(Boolean);
    if (turnNotes.length > 0) {
        result.note = turnNotes.join(' ');
    }
    return (0, report_1.finalizeScenarioResult)(result);
}
174
/**
 * Runs one scenario or a list of scenarios sequentially.
 *
 * @param scenario A single scenario or an array of scenarios.
 * @param options Run options. `context`, `env`, `outputDirectory`,
 *   `messageRunner` and `initialThreadId` are forwarded to each scenario run.
 *   `stopOnFailure` defaults to `true`. A numeric `maxDurationMs` is turned
 *   into one deadline, computed once and shared by every scenario in the call.
 * @returns An array of results when an array was passed, otherwise the single
 *   result (the first collected entry).
 *
 * A failing scenario is reported as a FAIL result rather than rethrown: the
 * result carried by a `ScenarioExecutionError` is used as is, and any other
 * error becomes a FAIL result with no turns and the error text as its note.
 */
async function runScenario(scenario, options) {
    const isBatch = Array.isArray(scenario);
    const queue = isBatch ? scenario : [scenario];
    const stopOnFailure = options.stopOnFailure ?? true;
    let deadlineAt;
    if (typeof options.maxDurationMs === 'number') {
        deadlineAt = Date.now() + options.maxDurationMs;
    }
    const collected = [];
    for (const current of queue) {
        let failed = false;
        try {
            const outcome = await runSingleScenario(current, options.context, options.env, options.outputDirectory, deadlineAt, options.messageRunner, options.initialThreadId);
            collected.push(outcome);
        }
        catch (error) {
            failed = true;
            const failure = error instanceof report_1.ScenarioExecutionError
                ? error.result
                : {
                    id: current.id,
                    name: current.name,
                    status: 'FAIL',
                    note: error instanceof Error ? error.message : String(error),
                    turns: [],
                    metrics: {},
                };
            collected.push(failure);
        }
        if (failed && stopOnFailure) {
            break;
        }
    }
    return isBatch ? collected : collected[0];
}
@@ -0,0 +1,33 @@
1
/**
 * Optional measurements attached to run results, run errors, turn results and
 * scenario results (the `metrics` field of each). Every field may be absent.
 * NOTE(review): the units and meanings below are inferred from the field names
 * only; confirm against the code that fills them in.
 */
+ export type TimingMetrics = {
2
// NOTE(review): presumably seconds until the first tool call — confirm.
+ ttfToolSeconds?: number;
3
// NOTE(review): presumably seconds until the first text output — confirm.
+ ttfTextSeconds?: number;
4
+ totalSeconds?: number;
5
+ protocolUsedKb?: number;
6
+ protocolTotalKb?: number;
7
// NOTE(review): presumably derived from the two protocol sizes above — confirm.
+ efficiencyPercent?: number;
8
+ };
9
/**
 * Value a message runner resolves to for one executed turn. The scenario
 * executor copies these fields onto the recorded turn result.
 */
+ export type MessageRunResult = {
10
// Passed to the turn's onComplete hook and to assertion/eval evaluation.
+ output: string;
11
// Reused by the executor as the thread id for the next turn.
+ threadId: string;
12
+ outputPath: string;
13
// Optional runner note; the executor joins assertion failure reasons onto it.
+ note?: string;
14
+ toolCallCount: number;
15
+ toolCalls: string[];
16
+ metrics: TimingMetrics;
17
+ };
18
/**
 * Error that carries the partial state of a failed message run. The scenario
 * executor records these fields as the failed turn when it catches one.
 */
+ export declare class MessageRunError extends Error {
19
+ threadId?: string;
20
+ outputPath?: string;
21
// Set to '' by the constructor when not supplied.
+ output: string;
22
// Set to {} by the constructor when not supplied.
+ metrics: TimingMetrics;
23
// Set to 0 by the constructor when not supplied.
+ toolCallCount: number;
24
// Set to [] by the constructor when not supplied.
+ toolCalls: string[];
25
/**
 * @param message Error message passed to `Error`.
 * @param options Optional partial run state; omitted fields get the defaults noted above.
 */
+ constructor(message: string, options?: {
26
+ threadId?: string;
27
+ outputPath?: string;
28
+ output?: string;
29
+ metrics?: TimingMetrics;
30
+ toolCallCount?: number;
31
+ toolCalls?: string[];
32
+ });
33
+ }
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.MessageRunError = void 0;
4
/**
 * Error raised for a failed message run that still carries whatever partial
 * state was captured (thread id, output path, output, metrics, tool calls).
 */
class MessageRunError extends Error {
    threadId;
    outputPath;
    output;
    metrics;
    toolCallCount;
    toolCalls;
    /**
     * @param message Error message passed to `Error`.
     * @param options Optional partial run state. `threadId` and `outputPath`
     *   are copied as given; a null or undefined `output`, `metrics`,
     *   `toolCallCount` or `toolCalls` falls back to `''`, `{}`, `0` and `[]`.
     */
    constructor(message, options = {}) {
        super(message);
        this.name = 'MessageRunError';
        Object.assign(this, {
            threadId: options.threadId,
            outputPath: options.outputPath,
            output: options.output ?? '',
            metrics: options.metrics ?? {},
            toolCallCount: options.toolCallCount ?? 0,
            toolCalls: options.toolCalls ?? [],
        });
    }
}
22
+ exports.MessageRunError = MessageRunError;
@@ -0,0 +1,12 @@
1
+ export * from './assertions/assertion';
2
+ export * from './assertions/turn-eval';
3
+ export * from './executors/execute';
4
+ export * from './executors/executor';
5
+ export * from './executors/run-result';
6
+ export * from './reports/report';
7
+ export * from './scenarios/generated-message';
8
+ export * from './scenarios/scenario';
9
+ export * from './scenarios/scenario-loader';
10
+ export * from './utils/artifact-files';
11
+ export * from './utils/config';
12
+ export * from './utils/openai-eval';
package/dist/index.js ADDED
@@ -0,0 +1,28 @@
1
+ "use strict";
2
// Re-export helper: exposes property `k` of module object `m` on `o` under the
// name `k2` (which defaults to `k`). An existing `this.__createBinding` is reused
// when present.
// NOTE(review): looks like the standard helper TypeScript emits for `export *` — confirm.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
// Replace the descriptor with an enumerable getter that reads m[k] on each access when:
// m has no own descriptor for k; or the descriptor is an accessor and m is not
// flagged __esModule; or it is a data property that is writable or configurable.
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
// Fallback used when Object.create is unavailable: copy the current value by assignment.
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// Re-exports every enumerable key of module `m` onto `exports` via __createBinding,
// skipping "default" and any key `exports` already owns (so earlier exports win).
// An existing `this.__exportStar` is reused when present.
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ __exportStar(require("./assertions/assertion"), exports);
18
+ __exportStar(require("./assertions/turn-eval"), exports);
19
+ __exportStar(require("./executors/execute"), exports);
20
+ __exportStar(require("./executors/executor"), exports);
21
+ __exportStar(require("./executors/run-result"), exports);
22
+ __exportStar(require("./reports/report"), exports);
23
+ __exportStar(require("./scenarios/generated-message"), exports);
24
+ __exportStar(require("./scenarios/scenario"), exports);
25
+ __exportStar(require("./scenarios/scenario-loader"), exports);
26
+ __exportStar(require("./utils/artifact-files"), exports);
27
+ __exportStar(require("./utils/config"), exports);
28
+ __exportStar(require("./utils/openai-eval"), exports);
@@ -0,0 +1,68 @@
1
+ You are evaluating an AI assistant turn inside a multi-turn test scenario.
2
+
3
+ Your job:
4
+ - Score only the assistant response for the current turn, while using the full conversation history as context.
5
+ - Evaluate only the requested dimensions.
6
+ - Score each dimension as an integer from 1 to 10, where 10 is best.
7
+ - Keep each explanation concise, no more than 3 sentences.
8
+ - Keep each suggestion concise and actionable, no more than 2 sentences.
9
+
10
+ General scoring guidance:
11
+ - 9 to 10: excellent, only minor or no issues
12
+ - 7 to 8: strong, useful response with some gaps
13
+ - 5 to 6: acceptable but noticeably incomplete, weak, or uneven
14
+ - 3 to 4: poor, major issues reduce usefulness
15
+ - 1 to 2: very poor, misleading, unusable, or severely off-target
16
+
17
+ Core dimension guidance:
18
+ - correctness:
19
+ Judge factual accuracy, internal consistency, instruction-following, and whether the response avoids invented or contradictory details.
20
+ High score: requirements are interpreted correctly and outputs are logically valid.
21
+ Low score: contains wrong facts, wrong logic, contradictions, or unsupported assumptions.
22
+
23
+ - coverage:
24
+ Judge whether the response addresses the important parts of the current request and relevant prior context.
25
+ High score: covers main cases, edge cases, constraints, and expected outputs at the right level.
26
+ Low score: omits important scenarios, ignores constraints, or responds too narrowly.
27
+
28
+ - helpfulness:
29
+ Judge whether the response is useful for the user's goal, easy to act on, and presented clearly.
30
+ High score: well-structured, practical, readable, and directly usable.
31
+ Low score: vague, hard to use, rambling, or missing next-step value.
32
+
33
+ Common optional dimensions:
34
+ - clarity:
35
+ Judge readability, organization, and whether the wording is easy to understand.
36
+
37
+ - completeness:
38
+ Judge whether the response feels sufficiently finished for the request, without major missing parts.
39
+
40
+ - conciseness:
41
+ Judge whether the response is appropriately brief without losing needed substance.
42
+
43
+ - relevance:
44
+ Judge whether the response stays on-topic and avoids unnecessary or distracting content.
45
+
46
+ - actionability:
47
+ Judge whether the user can directly use the response to proceed, implement, or decide next steps.
48
+
49
+ - structure:
50
+ Judge whether the response is organized into a form that is easy to scan and review.
51
+
52
+ - consistency:
53
+ Judge whether the response aligns with earlier turns and remains internally coherent.
54
+
55
+ - safety:
56
+ Judge whether the response avoids risky, misleading, or inappropriate guidance for the context.
57
+
58
+ Evaluation habits:
59
+ - Use the conversation history only as context. Score the current assistant response itself.
60
+ - Do not reward style if the answer is wrong.
61
+ - Do not punish brevity if the request is simple and the answer is still sufficient.
62
+ - If a dimension is not strongly applicable, still score it based on the closest reasonable interpretation.
63
+
64
+ Output rules:
65
+ - Return only valid JSON.
66
+ - Use this exact shape:
67
+ {"evaluations":[{"dimension":"correctness","score":8,"explanation":"short reason","suggestion":"short improvement"}]}
68
+ - Include one object for every requested dimension.
@@ -0,0 +1,16 @@
1
+ You generate exactly one user chat message for a scenario test runner.
2
+
3
+ Rules:
4
+ - Return only the user message text.
5
+ - Do not include explanations, labels, quotes, JSON, or markdown fences.
6
+ - Keep the message natural, specific, and useful.
7
+ - Preserve the important intent, constraints, and domain details from the provided inputs.
8
+ - If explicit guidance is provided, follow it closely.
9
+ - If prior conversation history exists, make the message follow naturally from that history.
10
+ - If source content is provided, convert it into a realistic user message instead of copying it mechanically when possible.
11
+ - Keep the message concise unless the source content requires more detail.
12
+
13
+ Quality bar:
14
+ - The message should sound like something a real user would send next.
15
+ - The message should move the conversation forward.
16
+ - The message must not introduce unrelated facts.
@@ -0,0 +1,68 @@
1
+ import { type AssertionEvaluationResult } from '../assertions/assertion';
2
+ import { type TurnEvaluationResult } from '../assertions/turn-eval';
3
+ import { type TimingMetrics } from '../executors/run-result';
4
/**
 * Recorded outcome of one scenario turn, as stored in `ScenarioRunResult.turns`.
 */
+ export type TurnRunResult = {
5
+ label: string;
6
// User message that was sent for this turn.
+ message: string;
7
// Names of the eval dimensions requested for the turn, when it has any.
+ evalDimensions?: string[];
8
+ threadId?: string;
9
+ outputPath?: string;
10
+ output?: string;
11
// Free-text note; the executor puts runner notes and failure reasons here.
+ note?: string;
12
+ toolCallCount: number;
13
+ toolCalls?: string[];
14
+ metrics: TimingMetrics;
15
+ assertionResults?: AssertionEvaluationResult[];
16
+ evaluationResults?: TurnEvaluationResult[];
17
+ };
18
/**
 * Outcome of one scenario: its identity, overall status and the turns that
 * were recorded for it.
 */
+ export type ScenarioRunResult = {
19
+ id: string;
20
+ name: string;
21
// NOTE(review): the executor shown in this package only sets 'PASS' or 'FAIL';
// where 'SKIP' is produced is not visible here.
+ status: 'PASS' | 'FAIL' | 'SKIP';
22
+ note?: string;
23
+ threadId?: string;
24
+ turns: TurnRunResult[];
25
+ metrics: TimingMetrics;
26
+ };
27
/**
 * Row shape returned by `buildScenarioSlackRows`, one per scenario result.
 * NOTE(review): the string fields presumably hold pre-formatted metric values
 * for display — confirm against the implementation.
 */
+ type ScenarioRunSlackRow = {
28
+ id: string;
29
+ name: string;
30
+ status: 'PASS' | 'FAIL' | 'SKIP';
31
+ ttfTool: string;
32
+ ttfText: string;
33
+ total: string;
34
+ efficiency: string;
35
+ note?: string;
36
+ };
37
/**
 * Runtime settings passed to `writeScenarioRunReport` as `options.runtime`.
 * NOTE(review): presumably the endpoints and identifiers the run targeted;
 * how each value is used in the report is not visible here.
 */
+ type ScenarioRuntimeSnapshot = {
38
+ agentUrl: string;
39
+ agentId: string;
40
+ wsUrl: string;
41
+ wsTopic: string;
42
+ accountId: string;
43
+ projectId: string;
44
+ appBaseUrl: string;
45
+ };
46
/**
 * Base options for `writeScenarioRunReport`. The exported function replaces
 * `scenarioContext` with its own generic `TContext`.
 */
+ type WriteScenarioRunReportOptions = {
47
+ outputDirectory: string;
48
+ runtime: ScenarioRuntimeSnapshot;
49
+ environment: string;
50
+ projectName: string;
51
+ scenarioContext: Record<string, unknown>;
52
+ results: ScenarioRunResult[];
53
+ };
54
/**
 * Value `writeScenarioRunReport` resolves to.
 * NOTE(review): presumably the locations of the written JSON and HTML reports — confirm.
 */
+ type ScenarioRunReportPaths = {
55
+ jsonPath: string;
56
+ htmlPath: string;
57
+ };
58
/**
 * Error that wraps the result of a failed scenario. The executor throws it
 * with the finalized FAIL result, and `runScenario` reads `result` back from it.
 */
+ export declare class ScenarioExecutionError extends Error {
59
+ result: ScenarioRunResult;
60
+ constructor(result: ScenarioRunResult);
61
+ }
62
/**
 * Called by the executor on a scenario result before it is returned or wrapped
 * in a `ScenarioExecutionError`.
 * NOTE(review): what finalization changes is not visible from this declaration.
 */
+ export declare function finalizeScenarioResult(result: ScenarioRunResult): ScenarioRunResult;
63
/**
 * Reduces a list of scenario results to one overall 'PASS' or 'FAIL'.
 * NOTE(review): the rule, including how 'SKIP' results count, is not visible here.
 */
+ export declare function getScenarioRunStatus(results: ScenarioRunResult[]): 'PASS' | 'FAIL';
64
/**
 * Maps scenario results to `ScenarioRunSlackRow` entries.
 * NOTE(review): the value formatting is not visible from this declaration.
 */
+ export declare function buildScenarioSlackRows(results: ScenarioRunResult[]): ScenarioRunSlackRow[];
65
/**
 * Takes the run results plus report metadata and resolves to a
 * `ScenarioRunReportPaths` value.
 * `scenarioContext` is typed by the caller through `TContext`; every other
 * option comes from `WriteScenarioRunReportOptions`.
 * NOTE(review): presumably writes JSON and HTML reports under `outputDirectory` — confirm.
 */
+ export declare function writeScenarioRunReport<TContext extends Record<string, unknown>>(options: Omit<WriteScenarioRunReportOptions, 'scenarioContext'> & {
66
+ scenarioContext: TContext;
67
+ }): Promise<ScenarioRunReportPaths>;
68
+ export {};