@huydao/karrot 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/GUIDE.md +484 -0
  2. package/README.md +253 -0
  3. package/dist/assertions/assertion.d.ts +18 -0
  4. package/dist/assertions/assertion.js +198 -0
  5. package/dist/assertions/turn-eval.d.ts +22 -0
  6. package/dist/assertions/turn-eval.js +178 -0
  7. package/dist/executors/adapters/ag-ui-post.d.ts +55 -0
  8. package/dist/executors/adapters/ag-ui-post.js +703 -0
  9. package/dist/executors/adapters/ag-ui.d.ts +15 -0
  10. package/dist/executors/adapters/ag-ui.js +275 -0
  11. package/dist/executors/execute.d.ts +16 -0
  12. package/dist/executors/execute.js +145 -0
  13. package/dist/executors/executor.d.ts +37 -0
  14. package/dist/executors/executor.js +203 -0
  15. package/dist/executors/run-result.d.ts +33 -0
  16. package/dist/executors/run-result.js +22 -0
  17. package/dist/index.d.ts +12 -0
  18. package/dist/index.js +28 -0
  19. package/dist/prompts/turn-eval-system-prompt.md +68 -0
  20. package/dist/prompts/turn-message-gen-system-prompt.md +16 -0
  21. package/dist/reports/report.d.ts +68 -0
  22. package/dist/reports/report.js +366 -0
  23. package/dist/scenarios/generated-message.d.ts +15 -0
  24. package/dist/scenarios/generated-message.js +116 -0
  25. package/dist/scenarios/scenario-loader.d.ts +12 -0
  26. package/dist/scenarios/scenario-loader.js +103 -0
  27. package/dist/scenarios/scenario.d.ts +62 -0
  28. package/dist/scenarios/scenario.js +35 -0
  29. package/dist/utils/artifact-files.d.ts +3 -0
  30. package/dist/utils/artifact-files.js +22 -0
  31. package/dist/utils/config.d.ts +101 -0
  32. package/dist/utils/config.js +57 -0
  33. package/dist/utils/openai-eval.d.ts +5 -0
  34. package/dist/utils/openai-eval.js +54 -0
  35. package/package.json +146 -0
@@ -0,0 +1,18 @@
1
+ import { type AiTurnAssertion } from '../scenarios/scenario';
2
/** Outcome of evaluating one scenario assertion against a single assistant turn. */
+ export type AssertionEvaluationResult = {
3
/** 'assert' results are computed locally by the companion JS; 'aiAssert' results come from a model verdict. */
+ kind: 'assert' | 'aiAssert';
4
/** hasText is a substring check, toolcall compares tool names, hasContent and notHasContent are judged by the model. */
+ matcher: 'hasText' | 'toolcall' | 'hasContent' | 'notHasContent';
5
/** The expectation that was checked. Tool-call results carry a trimmed list of tool names. */
+ expected: string | string[];
6
/** Optional label copied from the scenario assertion; used as a prefix in locally built reasons. */
+ description?: string;
7
+ passed: boolean;
8
/** Human-readable explanation of the verdict. */
+ reason: string;
9
+ };
10
/** Inputs for evaluateTurnAssertions. */
+ type EvaluateTurnAssertionsOptions = {
11
/** Assertions to evaluate; when missing or empty the companion JS returns an empty result list. */
+ assertions?: AiTurnAssertion[];
12
/** Assistant response text for the turn under test. */
+ output: string;
13
/** Names of tool calls observed during the turn; compared against toolcall assertions. */
+ toolCalls?: string[];
14
/** Environment passed to the OpenAI helper functions when an aiAssert is evaluated. */
+ env?: NodeJS.ProcessEnv;
15
/** NOTE(review): not read by the evaluateTurnAssertions implementation shown in this diff; confirm whether it is still needed. */
+ outputDirectory?: string;
16
+ };
17
/** Evaluates every assertion in order and resolves with one result per assertion. */
+ export declare function evaluateTurnAssertions(options: EvaluateTurnAssertionsOptions): Promise<AssertionEvaluationResult[]>;
18
+ export {};
@@ -0,0 +1,198 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.evaluateTurnAssertions = evaluateTurnAssertions;
4
+ const openai_eval_1 = require("../utils/openai-eval");
5
/**
 * Flattens a raw scenario assertion into a uniform descriptor.
 *
 * A raw assertion carries either an `assert` object (keyed by `hasText` or
 * `toolcall`) or an `aiAssert` object (keyed by `hasContent` or
 * `notHasContent`). Whichever key is present becomes `matcher` and its value
 * becomes `expected`.
 *
 * @param assertion Raw assertion as written in the scenario.
 * @returns `{ kind, matcher, expected, description }`.
 */
function normalizeAssertion(assertion) {
    const { description } = assertion;
    if ('assert' in assertion) {
        const checks = assertion.assert;
        // Anything that is not `hasText` is treated as a tool-call expectation.
        const localMatcher = 'hasText' in checks ? 'hasText' : 'toolcall';
        return {
            kind: 'assert',
            matcher: localMatcher,
            expected: checks[localMatcher],
            description,
        };
    }
    const aiChecks = assertion.aiAssert;
    // Anything that is not `hasContent` is treated as `notHasContent`.
    const aiMatcher = 'hasContent' in aiChecks ? 'hasContent' : 'notHasContent';
    return {
        kind: 'aiAssert',
        matcher: aiMatcher,
        expected: aiChecks[aiMatcher],
        description,
    };
}
37
/**
 * Renders an expectation for use in a reason string.
 *
 * @param expected A single value or a list of values.
 * @returns `[a, b]` for a list, `"value"` (double-quoted) for anything else.
 */
function formatExpectedValue(expected) {
    return Array.isArray(expected) ? `[${expected.join(', ')}]` : `"${expected}"`;
}
43
/**
 * Builds the `matcher(expected)` label used at the start of a reason string,
 * prefixed with `description: ` when the result has a description.
 *
 * @param result Object exposing `description`, `matcher` and `expected`.
 * @returns The formatted label.
 */
function buildAssertionReason(result) {
    const { description, matcher, expected } = result;
    const label = `${matcher}(${formatExpectedValue(expected)})`;
    return description ? `${description}: ${label}` : label;
}
47
/**
 * Produces the reason label for a normalized assertion that has no verdict
 * yet, by wrapping it in a placeholder result (`passed: false`, empty
 * `reason`) and delegating to `buildAssertionReason`.
 *
 * @param assertion Normalized assertion (`kind`, `matcher`, `expected`, `description`).
 * @returns The formatted label.
 */
function buildNormalizedAssertionReason(assertion) {
    const { kind, matcher, expected, description } = assertion;
    const placeholderResult = { kind, matcher, expected, description, passed: false, reason: '' };
    return buildAssertionReason(placeholderResult);
}
57
/**
 * Cuts the span from the first `{` to the last `}` out of a model reply, so
 * that surrounding chatter or code fences do not break JSON parsing.
 *
 * @param text Raw reply text.
 * @returns The brace-delimited substring, or `undefined` when no `{` is
 *          followed by a later `}`.
 */
function extractJsonObject(text) {
    const open = text.indexOf('{');
    const close = text.lastIndexOf('}');
    // `close > open` with `open >= 0` also rules out a missing closing brace.
    const hasBracePair = open !== -1 && close > open;
    return hasBracePair ? text.slice(open, close + 1) : undefined;
}
65
/**
 * Converts the evaluator model's reply into an assertion result.
 *
 * The reply is expected to contain a JSON object shaped like
 * `{"pass": boolean, "reason": string}`. Only a literal `true` counts as a
 * pass. A missing or unparseable object yields a failed result rather than an
 * exception.
 *
 * @param output   Raw text returned by the model.
 * @param fallback Assertion fields (`kind`, `matcher`, `expected`,
 *                 `description`) copied into every returned result.
 * @returns The fallback fields plus `passed` and `reason`.
 */
function parseAiAssertionResult(output, fallback) {
    const failWith = (reason) => ({ ...fallback, passed: false, reason });
    const candidate = extractJsonObject(output);
    if (!candidate) {
        return failWith('AI assertion did not return parseable JSON.');
    }
    let verdict;
    try {
        verdict = JSON.parse(candidate);
    }
    catch {
        return failWith('AI assertion returned invalid JSON.');
    }
    // `candidate` starts with `{`, so a successful parse always yields an object.
    const explanation = typeof verdict.reason === 'string' ? verdict.reason.trim() : '';
    return {
        ...fallback,
        passed: verdict.pass === true,
        reason: explanation || 'No reason provided.',
    };
}
90
/**
 * Asks the configured OpenAI model whether an assistant response satisfies a
 * semantic (`aiAssert`) assertion.
 *
 * Sends one POST to `<base url>/responses` with a developer instruction and a
 * user prompt describing the assertion and the response, then parses the
 * reply with `parseAiAssertionResult`.
 *
 * @param assertion Normalized assertion (`kind`, `matcher`, `expected`, `description`).
 * @param output    Assistant response text being judged.
 * @param env       Environment passed to the OpenAI helper functions.
 * @returns The assertion result derived from the model's verdict.
 * @throws Error when the HTTP response is not OK; the message includes the
 *         status code and response body.
 */
async function evaluateAiAssertion(assertion, output, env) {
    const { kind, matcher, expected, description } = assertion;
    const userPrompt = [
        'You are evaluating an assistant response against an assertion.',
        'Judge semantically, not only by exact string match.',
        `Assertion type: ${matcher}`,
        `Assertion text: ${expected}`,
        'Assistant response:',
        output,
        'Return only compact JSON with this shape:',
        '{"pass":true,"reason":"short explanation"}',
    ].join('\n');
    // Resolved in the same order as before: base URL, then API key, then model.
    const endpoint = `${(0, openai_eval_1.getOpenAiBaseUrl)(env)}/responses`;
    const authorization = `Bearer ${(0, openai_eval_1.getRequiredOpenAiApiKey)(env)}`;
    const model = (0, openai_eval_1.getOpenAiEvalModel)(env);
    const requestBody = {
        model,
        input: [
            {
                role: 'developer',
                content: 'Evaluate whether the assistant response satisfies the assertion. Return only JSON: {"pass":true|false,"reason":"short explanation"}',
            },
            {
                role: 'user',
                content: userPrompt,
            },
        ],
    };
    const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
            Authorization: authorization,
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(requestBody),
    });
    if (!response.ok) {
        const status = response.status;
        const detail = await response.text();
        throw new Error(`OpenAI aiAssert failed with HTTP ${status}: ${detail}`);
    }
    const payload = await response.json();
    const replyText = (0, openai_eval_1.extractOutputText)(payload);
    return parseAiAssertionResult(replyText, { kind, matcher, expected, description });
}
133
/**
 * Checks a `toolcall` assertion against the tool calls observed in a turn.
 *
 * Names on both sides are trimmed and blank names are dropped. An empty
 * expectation means "no tool may have been called"; otherwise every expected
 * name must appear among the observed calls (extra observed calls are
 * allowed).
 *
 * A string expectation is treated as a one-element list. Previously any
 * non-array value collapsed to an empty list, which silently turned
 * "expect tool X" into "expect no tool calls".
 *
 * @param assertion Normalized assertion (`kind`, `matcher`, `expected`, `description`).
 * @param toolCalls Names of the tool calls observed during the turn (may be undefined).
 * @returns Assertion result whose `expected` is the cleaned list of tool names.
 */
function evaluateToolCallAssertion(assertion, toolCalls) {
    let rawExpected = [];
    if (Array.isArray(assertion.expected)) {
        rawExpected = assertion.expected;
    }
    else if (typeof assertion.expected === 'string') {
        // A single tool name written as a scalar.
        rawExpected = [assertion.expected];
    }
    const expectedToolCalls = rawExpected.map((toolCall) => toolCall.trim()).filter(Boolean);
    const actualToolCalls = (toolCalls ?? []).map((toolCall) => toolCall.trim()).filter(Boolean);
    const actualToolCallSet = new Set(actualToolCalls);
    const label = buildNormalizedAssertionReason({ ...assertion, expected: expectedToolCalls });
    const observed = `Observed: ${formatExpectedValue(actualToolCalls)}.`;
    const base = {
        kind: assertion.kind,
        matcher: assertion.matcher,
        expected: expectedToolCalls,
        description: assertion.description,
    };
    if (expectedToolCalls.length === 0) {
        // Nothing expected: the turn passes only when no tool was called.
        const passed = actualToolCalls.length === 0;
        return {
            ...base,
            passed,
            reason: passed ? `${label} passed.` : `${label} failed. ${observed}`,
        };
    }
    const missingToolCalls = expectedToolCalls.filter((toolCall) => !actualToolCallSet.has(toolCall));
    const passed = missingToolCalls.length === 0;
    return {
        ...base,
        passed,
        reason: passed
            ? `${label} passed. ${observed}`
            : `${label} failed. Missing: ${formatExpectedValue(missingToolCalls)}. ${observed}`,
    };
}
169
/**
 * Evaluates every assertion attached to a turn, in order.
 *
 * - `assert` + `toolcall`: compared against `options.toolCalls`.
 * - `assert` + `hasText`: passes when `options.output` contains the expected
 *   text as a substring; a list expectation is first joined with `', '`.
 * - `aiAssert`: delegated to the evaluator model, one request at a time.
 *
 * @param options `{ assertions, output, toolCalls, env }`.
 * @returns One result per assertion; an empty array when there are none.
 */
async function evaluateTurnAssertions(options) {
    const rawAssertions = options.assertions;
    if (!rawAssertions?.length) {
        return [];
    }
    const results = [];
    for (const rawAssertion of rawAssertions) {
        const assertion = normalizeAssertion(rawAssertion);
        if (assertion.kind !== 'assert') {
            results.push(await evaluateAiAssertion(assertion, options.output, options.env));
        }
        else if (assertion.matcher === 'toolcall') {
            results.push(evaluateToolCallAssertion(assertion, options.toolCalls));
        }
        else {
            const expected = typeof assertion.expected === 'string'
                ? assertion.expected
                : assertion.expected.join(', ');
            const passed = options.output.includes(expected);
            const label = buildNormalizedAssertionReason({ ...assertion, expected });
            results.push({
                kind: assertion.kind,
                matcher: assertion.matcher,
                expected,
                description: assertion.description,
                passed,
                reason: `${label} ${passed ? 'passed' : 'failed'}.`,
            });
        }
    }
    return results;
}
@@ -0,0 +1,22 @@
1
+ import { type AiTurnEvalDefinition } from '../scenarios/scenario';
2
/** Score and feedback for one evaluation dimension of a single turn. */
+ export type TurnEvaluationResult = {
3
/** Dimension name as requested by the caller. */
+ dimension: string;
4
/** Whole number from 1 to 10; the companion JS clamps the model value and falls back to 1 when it is not numeric. */
+ score: number;
5
/** Model explanation, cut to at most its first three sentences; a fixed placeholder when absent. */
+ explanation: string;
6
/** Model suggestion, cut to at most its first three sentences; a fixed placeholder when absent. */
+ suggestion: string;
7
+ };
8
/** Inputs for evaluateTurnEvals. */
+ type EvaluateTurnEvalsOptions = {
9
/** Dimensions to score; when missing or empty the companion JS returns an empty list without calling the model. */
+ dimensions?: AiTurnEvalDefinition[];
10
+ scenarioId: string;
11
+ scenarioName: string;
12
/** Label of the turn being evaluated; included in the prompt as the current turn. */
+ turnLabel: string;
13
/** NOTE(review): not read by the evaluateTurnEvals implementation shown in this diff; the prompt is built from history instead. */
+ output: string;
14
/** Environment passed to the OpenAI helpers and used to locate prompt files. */
+ env?: NodeJS.ProcessEnv;
15
/** Conversation so far, rendered into the prompt in order; the prompt text says it includes the current turn. */
+ history: Array<{
16
+ label: string;
17
+ message: string;
18
/** Assistant reply; blank or missing replies are shown to the model as [empty response]. */
+ output?: string;
19
+ }>;
20
+ };
21
/** Scores the turn on each requested dimension; rejects on a non-OK HTTP response or when the reply contains no JSON object. */
+ export declare function evaluateTurnEvals(options: EvaluateTurnEvalsOptions): Promise<TurnEvaluationResult[]>;
22
+ export {};
@@ -0,0 +1,178 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.evaluateTurnEvals = evaluateTurnEvals;
7
+ const node_path_1 = __importDefault(require("node:path"));
8
+ const openai_eval_1 = require("../utils/openai-eval");
9
// Module-level memo of prompt text. Keys used in this file: a system prompt file path,
// "dir:<directory>" for a joined prompt directory, and "<file>::<directory>" for the combined prompt.
+ const cachedSystemPrompts = new Map();
10
// Prompt file used when AI_TURN_EVAL_SYSTEM_PROMPT_PATH is not set; resolved relative to this module.
+ const DEFAULT_SYSTEM_PROMPT_PATH = node_path_1.default.resolve(__dirname, '..', 'prompts', 'turn-eval-system-prompt.md');
11
/**
 * Picks the system prompt file for turn evaluation.
 *
 * @param env Environment; `AI_TURN_EVAL_SYSTEM_PROMPT_PATH` overrides the default.
 * @returns The override (relative paths are resolved against the current
 *          working directory), or the bundled default when the variable is
 *          unset or blank.
 */
function resolveSystemPromptPath(env) {
    const override = env?.AI_TURN_EVAL_SYSTEM_PROMPT_PATH?.trim();
    if (override) {
        return node_path_1.default.isAbsolute(override)
            ? override
            : node_path_1.default.resolve(process.cwd(), override);
    }
    return DEFAULT_SYSTEM_PROMPT_PATH;
}
18
/**
 * Locates the optional directory of extra evaluation guidance files.
 *
 * @param env Environment; `AI_TURN_EVAL_PROMPT_DIRECTORY` names the directory.
 * @returns The directory path (relative paths are resolved against the
 *          current working directory), or `undefined` when the variable is
 *          unset or blank.
 */
function resolvePromptDirectoryPath(env) {
    const configured = env?.AI_TURN_EVAL_PROMPT_DIRECTORY?.trim();
    if (configured) {
        return node_path_1.default.isAbsolute(configured)
            ? configured
            : node_path_1.default.resolve(process.cwd(), configured);
    }
    return undefined;
}
25
/**
 * Reads every `.txt` / `.md` file directly inside a directory and joins them
 * into one prompt fragment.
 *
 * Files are ordered by `localeCompare` on their names, blank files are
 * skipped, and each remaining file is emitted as `File: <name>` followed by
 * its trimmed content. The joined text is cached under `dir:<path>`; because
 * the cache is checked by truthiness, an empty result is read again on the
 * next call.
 *
 * @param directoryPath Directory to scan (not recursive).
 * @returns The joined fragment, or an empty string when nothing usable was found.
 */
async function readPromptDirectory(directoryPath) {
    const cacheKey = `dir:${directoryPath}`;
    const memoized = cachedSystemPrompts.get(cacheKey);
    if (memoized) {
        return memoized;
    }
    const { readdir, readFile } = await import('node:fs/promises');
    const dirents = await readdir(directoryPath, { withFileTypes: true });
    const promptFileNames = [];
    for (const dirent of dirents) {
        if (dirent.isFile() && /\.(txt|md)$/i.test(dirent.name)) {
            promptFileNames.push(dirent.name);
        }
    }
    promptFileNames.sort((left, right) => left.localeCompare(right));
    // Files are read concurrently; Promise.all keeps the sorted order.
    const sections = await Promise.all(promptFileNames.map(async (name) => {
        const raw = await readFile(node_path_1.default.join(directoryPath, name), 'utf8');
        const body = raw.trim();
        return body ? `File: ${name}\n${body}` : undefined;
    }));
    const joinedPrompt = sections.filter(Boolean).join('\n\n');
    cachedSystemPrompts.set(cacheKey, joinedPrompt);
    return joinedPrompt;
}
49
/**
 * Loads the evaluator system prompt, optionally extended with
 * project-specific guidance from a prompt directory.
 *
 * Both the bare prompt file and the final combined text are cached, so
 * repeated evaluations with the same configuration avoid re-reading the
 * prompt file.
 *
 * @param env Environment used to resolve the prompt file and directory.
 * @returns The system prompt text to send to the model.
 */
async function readSystemPrompt(env) {
    const systemPromptPath = resolveSystemPromptPath(env);
    const promptDirectoryPath = resolvePromptDirectoryPath(env);
    const combinedKey = `${systemPromptPath}::${promptDirectoryPath ?? ''}`;
    const memoized = cachedSystemPrompts.get(combinedKey);
    if (memoized) {
        return memoized;
    }
    const { readFile } = await import('node:fs/promises');
    const basePrompt = cachedSystemPrompts.get(systemPromptPath) ?? await readFile(systemPromptPath, 'utf8');
    cachedSystemPrompts.set(systemPromptPath, basePrompt);
    let resolvedPrompt = basePrompt;
    if (promptDirectoryPath) {
        const directoryPrompt = await readPromptDirectory(promptDirectoryPath);
        if (directoryPrompt) {
            resolvedPrompt = `${basePrompt.trim()}\n\nAdditional project-specific evaluation guidance:\n${directoryPrompt}`;
        }
    }
    cachedSystemPrompts.set(combinedKey, resolvedPrompt);
    return resolvedPrompt;
}
72
/**
 * Returns the substring running from the first `{` to the last `}` of a
 * model reply, which tolerates prose or code fences around the JSON.
 *
 * @param text Raw reply text.
 * @returns The brace-delimited substring, or `undefined` when the text has
 *          no `{` that is followed by a later `}`.
 */
function extractJsonObject(text) {
    const firstBrace = text.indexOf('{');
    if (firstBrace < 0) {
        return undefined;
    }
    const lastBrace = text.lastIndexOf('}');
    // Also covers a missing closing brace, where lastBrace is -1.
    if (lastBrace <= firstBrace) {
        return undefined;
    }
    return text.slice(firstBrace, lastBrace + 1);
}
80
/**
 * Coerces a model-supplied score into a whole number between 1 and 10.
 *
 * @param value Anything the model returned for `score`.
 * @returns The value rounded to the nearest integer and limited to 1..10;
 *          1 when the value does not convert to a finite number.
 */
function clampScore(value) {
    const numeric = Number(value);
    if (!Number.isFinite(numeric)) {
        return 1;
    }
    const rounded = Math.round(numeric);
    if (rounded < 1) {
        return 1;
    }
    return rounded > 10 ? 10 : rounded;
}
87
/**
 * Normalizes free-form model feedback to a short string.
 *
 * @param value    Candidate text; non-strings and blank strings are replaced by `fallback`.
 * @param fallback Text to use when nothing usable is available.
 * @returns At most the first three sentences (split after `.`, `!` or `?`
 *          followed by whitespace), joined with single spaces.
 */
function normalizeShortText(value, fallback) {
    const trimmed = typeof value === 'string' ? value.trim() : '';
    const source = trimmed || fallback;
    const leadingSentences = source
        .split(/(?<=[.!?])\s+/)
        .filter((sentence) => sentence.length > 0)
        .slice(0, 3);
    const summary = leadingSentences.join(' ').trim();
    return summary ? summary : fallback;
}
92
/**
 * Turns the evaluator model's reply into one result per requested dimension.
 *
 * The reply must contain a JSON object with an `evaluations` array whose
 * entries carry `dimension`, `score`, `explanation` and `suggestion`.
 * Dimension names are matched case-insensitively with surrounding whitespace
 * ignored on both sides. When several entries name the same dimension the
 * first one wins. A requested dimension with no matching entry gets score 1
 * and placeholder text.
 *
 * Entries that are `null`, not objects, or lack a string `dimension` are
 * ignored. Previously a `null` entry made the lookup throw and discarded the
 * whole evaluation.
 *
 * @param output     Raw text returned by the model.
 * @param dimensions Requested dimension names, in output order.
 * @returns One `{ dimension, score, explanation, suggestion }` per requested dimension.
 * @throws Error when the reply contains no brace-delimited span.
 * @throws SyntaxError when that span is not valid JSON.
 */
function parseEvaluationResults(output, dimensions) {
    const jsonText = extractJsonObject(output);
    if (!jsonText) {
        throw new Error('Turn evaluation did not return parseable JSON.');
    }
    const parsed = JSON.parse(jsonText);
    const evaluations = Array.isArray(parsed.evaluations) ? parsed.evaluations : [];
    // Index usable entries by normalized name; keep the first entry per name.
    const byDimension = new Map();
    for (const evaluation of evaluations) {
        if (evaluation === null || typeof evaluation !== 'object' || typeof evaluation.dimension !== 'string') {
            continue;
        }
        const key = evaluation.dimension.trim().toLowerCase();
        if (!byDimension.has(key)) {
            byDimension.set(key, evaluation);
        }
    }
    return dimensions.map((dimension) => {
        const matched = byDimension.get(dimension.trim().toLowerCase()) ?? {};
        return {
            dimension,
            score: clampScore(matched.score),
            explanation: normalizeShortText(matched.explanation, 'No explanation provided.'),
            suggestion: normalizeShortText(matched.suggestion, 'No suggestion provided.'),
        };
    });
}
109
/**
 * Normalizes evaluation dimension definitions to `{ dimension, guidance }`.
 *
 * A definition is either a bare dimension name or an object with a
 * `dimension` and optional `guidance`. String guidance is trimmed. Missing
 * or non-string guidance becomes `undefined` instead of throwing, which
 * matches how the prompt builder already skips dimensions without guidance.
 *
 * @param dimensions Definitions as written in the scenario.
 * @returns One normalized entry per definition, in the same order. Entries
 *          built from a bare name have no `guidance` property.
 */
function normalizeEvalDefinitions(dimensions) {
    return dimensions.map((definition) => {
        if (typeof definition === 'string') {
            return { dimension: definition };
        }
        const guidance = typeof definition.guidance === 'string' ? definition.guidance.trim() : undefined;
        return { dimension: String(definition.dimension), guidance };
    });
}
117
/**
 * Renders the conversation history as plain text for the evaluation prompt.
 *
 * Each entry becomes three lines (turn number and label, user message,
 * assistant response); entries are separated by a blank line. A missing or
 * blank assistant response is shown as `[empty response]`.
 *
 * @param history Entries with `label`, `message` and optional `output`.
 * @returns The rendered transcript; an empty string for an empty history.
 */
function buildHistoryText(history) {
    const renderedTurns = [];
    history.forEach((entry, index) => {
        const reply = entry.output?.trim() || '[empty response]';
        renderedTurns.push(`Turn ${index + 1}: ${entry.label}\nUser message: ${entry.message}\nAssistant response: ${reply}`);
    });
    return renderedTurns.join('\n\n');
}
126
/**
 * Scores a turn on each requested evaluation dimension using the configured
 * OpenAI model.
 *
 * Builds a prompt from the scenario identifiers, the dimension list, any
 * per-dimension guidance and the conversation history, sends it with the
 * system prompt to `<base url>/responses`, and parses the reply with
 * `parseEvaluationResults`.
 *
 * @param options `{ dimensions, scenarioId, scenarioName, turnLabel, history, env }`.
 * @returns One result per dimension; an empty array (and no request) when no
 *          dimensions are given.
 * @throws Error when the HTTP response is not OK or the reply contains no JSON object.
 */
async function evaluateTurnEvals(options) {
    if (!options.dimensions?.length) {
        return [];
    }
    const systemPrompt = await readSystemPrompt(options.env);
    const dimensions = normalizeEvalDefinitions(options.dimensions);
    const dimensionNames = dimensions.map((entry) => entry.dimension);
    const guided = dimensions.filter((entry) => entry.guidance);
    const promptLines = [
        `Scenario ID: ${options.scenarioId}`,
        `Scenario name: ${options.scenarioName}`,
        `Current turn: ${options.turnLabel}`,
        `Evaluation dimensions: ${dimensionNames.join(', ')}`,
    ];
    if (guided.length > 0) {
        // Only emitted when at least one dimension carries guidance.
        promptLines.push('', 'Custom evaluation guidance:');
        for (const entry of guided) {
            promptLines.push(`- ${entry.dimension}: ${entry.guidance}`);
        }
    }
    promptLines.push(
        '',
        'Conversation history including the current turn:',
        buildHistoryText(options.history),
        '',
        'Return only compact JSON with this shape:',
        '{"evaluations":[{"dimension":"correctness","score":8,"explanation":"short reason","suggestion":"short improvement"}]}',
    );
    const prompt = promptLines.join('\n');
    // Resolved in the same order as before: base URL, then API key, then model.
    const endpoint = `${(0, openai_eval_1.getOpenAiBaseUrl)(options.env)}/responses`;
    const authorization = `Bearer ${(0, openai_eval_1.getRequiredOpenAiApiKey)(options.env)}`;
    const model = (0, openai_eval_1.getOpenAiEvalModel)(options.env);
    const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
            Authorization: authorization,
            'Content-Type': 'application/json',
        },
        body: JSON.stringify({
            model,
            input: [
                { role: 'developer', content: systemPrompt },
                { role: 'user', content: prompt },
            ],
        }),
    });
    if (!response.ok) {
        const status = response.status;
        const detail = await response.text();
        throw new Error(`OpenAI turn eval failed with HTTP ${status}: ${detail}`);
    }
    const payload = await response.json();
    const replyText = (0, openai_eval_1.extractOutputText)(payload);
    return parseEvaluationResults(replyText, dimensionNames);
}
@@ -0,0 +1,55 @@
1
+ import { type MessageRunResult } from '../run-result';
2
/**
 * Options accepted by runAgUiPostMessage.
 * NOTE(review): the implementation is not visible in this part of the diff; the notes below are
 * inferred from field names and types only and should be confirmed against ag-ui-post.js.
 */
+ export type RunAgUiPostMessageOptions = {
3
+ message: string;
4
+ outputDirectory: string;
5
+ threadId?: string;
6
+ processTimeoutMs?: number;
7
+ injectMessage?: boolean;
8
+ injectRunMetadata?: boolean;
9
/** Presumably the HTTP request that starts a run (target URL, optional headers, JSON payload) - TODO confirm. */
+ run?: {
10
+ url: string;
11
+ headers?: Record<string, string>;
12
+ payload: Record<string, unknown>;
13
+ };
14
/** Same shape as run plus its own processTimeoutMs; purpose not determinable from this declaration - TODO confirm. */
+ connect?: {
15
+ url: string;
16
+ headers?: Record<string, string>;
17
+ payload: Record<string, unknown>;
18
+ processTimeoutMs?: number;
19
+ };
20
/** Polling-based observation settings; 'http-poll' is the only declared type. */
+ observe?: {
21
+ type: 'http-poll';
22
/** Status endpoint; statusPath presumably selects the status value inside the response - TODO confirm path syntax. */
+ status: {
23
+ url: string;
24
+ headers?: Record<string, string>;
25
+ statusPath: string;
26
+ successStatuses: string[];
27
+ failureStatuses?: string[];
28
+ };
29
+ progressEndpoints?: Array<{
30
+ url: string;
31
+ headers?: Record<string, string>;
32
+ itemsPath: string;
33
+ idPath?: string;
34
+ eventType?: string;
35
+ }>;
36
+ outputEndpoint?: {
37
+ url: string;
38
+ headers?: Record<string, string>;
39
+ textPath: string;
40
+ };
41
/** Units are presumably milliseconds, per the Ms suffix - TODO confirm defaults. */
+ intervalMs?: number;
42
+ timeoutMs?: number;
43
+ };
44
/** Flat polling check with the same status fields as observe.status; relationship to observe is not visible here - TODO confirm. */
+ completionCheck?: {
45
+ type: 'http-poll';
46
+ url: string;
47
+ headers?: Record<string, string>;
48
+ statusPath: string;
49
+ successStatuses: string[];
50
+ failureStatuses?: string[];
51
+ intervalMs?: number;
52
+ timeoutMs?: number;
53
+ };
54
+ };
55
/** Runs one message using the given options and resolves with a MessageRunResult (declared in ../run-result). */
+ export declare function runAgUiPostMessage(options: RunAgUiPostMessageOptions): Promise<MessageRunResult>;