@netlify/axis 0.3.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -5
- package/dist/baselines/{diff.d.ts → compare.d.ts} +3 -3
- package/dist/baselines/compare.d.ts.map +1 -0
- package/dist/baselines/{diff.js → compare.js} +2 -2
- package/dist/baselines/compare.js.map +1 -0
- package/dist/baselines/index.d.ts +1 -1
- package/dist/baselines/index.d.ts.map +1 -1
- package/dist/baselines/index.js +1 -1
- package/dist/baselines/index.js.map +1 -1
- package/dist/cli.js +16 -12
- package/dist/cli.js.map +1 -1
- package/dist/config/validator.js +4 -4
- package/dist/index.d.ts +4 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/dist/report-ui/index.html +49 -46
- package/dist/reports/writer.d.ts +28 -7
- package/dist/reports/writer.d.ts.map +1 -1
- package/dist/reports/writer.js +69 -30
- package/dist/reports/writer.js.map +1 -1
- package/dist/runner/runner.js +1 -0
- package/dist/runner/runner.js.map +1 -1
- package/dist/scoring/deep-eval.d.ts +27 -7
- package/dist/scoring/deep-eval.d.ts.map +1 -1
- package/dist/scoring/deep-eval.js +242 -147
- package/dist/scoring/deep-eval.js.map +1 -1
- package/dist/scoring/goal-achievement.d.ts.map +1 -1
- package/dist/scoring/goal-achievement.js +26 -102
- package/dist/scoring/goal-achievement.js.map +1 -1
- package/dist/scoring/index.d.ts +1 -1
- package/dist/scoring/index.d.ts.map +1 -1
- package/dist/scoring/index.js +12 -7
- package/dist/scoring/index.js.map +1 -1
- package/dist/scoring/judge.d.ts +17 -0
- package/dist/scoring/judge.d.ts.map +1 -0
- package/dist/scoring/judge.js +43 -0
- package/dist/scoring/judge.js.map +1 -0
- package/dist/scoring/prompt-templates.d.ts +50 -0
- package/dist/scoring/prompt-templates.d.ts.map +1 -0
- package/dist/scoring/prompt-templates.js +251 -0
- package/dist/scoring/prompt-templates.js.map +1 -0
- package/dist/types/baseline.d.ts +4 -4
- package/dist/types/baseline.d.ts.map +1 -1
- package/dist/types/config.d.ts +2 -2
- package/dist/types/output.d.ts +2 -0
- package/dist/types/output.d.ts.map +1 -1
- package/dist/types/output.js.map +1 -1
- package/dist/types/scoring.d.ts +25 -26
- package/dist/types/scoring.d.ts.map +1 -1
- package/dist/ui/format.d.ts +2 -2
- package/dist/ui/format.d.ts.map +1 -1
- package/dist/ui/format.js +1 -1
- package/dist/ui/format.js.map +1 -1
- package/package.json +1 -1
- package/dist/baselines/diff.d.ts.map +0 -1
- package/dist/baselines/diff.js.map +0 -1
- package/dist/scoring/triage.d.ts +0 -15
- package/dist/scoring/triage.d.ts.map +0 -1
- package/dist/scoring/triage.js +0 -204
- package/dist/scoring/triage.js.map +0 -1
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
import
|
|
2
|
-
import * as os from "node:os";
|
|
3
|
-
import * as path from "node:path";
|
|
4
|
-
import { getAdapter } from "../adapters/registry.js";
|
|
1
|
+
import { callJudge } from "./judge.js";
|
|
5
2
|
import { parseJsonFromText } from "./parse-json.js";
|
|
3
|
+
import { getPromptTemplates, interpolate } from "./prompt-templates.js";
|
|
6
4
|
export async function scoreGoalAchievement(result, normalizedEntries) {
|
|
7
5
|
const { rubric } = result;
|
|
8
6
|
const { result: finalResult } = result.output;
|
|
@@ -16,7 +14,10 @@ export async function scoreGoalAchievement(result, normalizedEntries) {
|
|
|
16
14
|
}
|
|
17
15
|
async function scoreStringRubric(runResult, rubric, entries, finalResult) {
|
|
18
16
|
const prompt = buildStringRubricPrompt(runResult, entries, finalResult, rubric);
|
|
19
|
-
const responseText = await callJudge(runResult, prompt
|
|
17
|
+
const responseText = await callJudge(runResult, prompt, {
|
|
18
|
+
scenarioKey: "__judge__",
|
|
19
|
+
scenarioName: "AXIS Judge",
|
|
20
|
+
});
|
|
20
21
|
const parsed = parseJsonFromText(responseText);
|
|
21
22
|
if (!parsed || typeof parsed.score !== "number") {
|
|
22
23
|
return {
|
|
@@ -46,115 +47,38 @@ async function scoreStringRubric(runResult, rubric, entries, finalResult) {
|
|
|
46
47
|
}
|
|
47
48
|
async function scoreArrayRubric(runResult, rubric, entries, finalResult) {
|
|
48
49
|
const prompt = buildArrayRubricPrompt(runResult, entries, finalResult, rubric);
|
|
49
|
-
const responseText = await callJudge(runResult, prompt
|
|
50
|
+
const responseText = await callJudge(runResult, prompt, {
|
|
51
|
+
scenarioKey: "__judge__",
|
|
52
|
+
scenarioName: "AXIS Judge",
|
|
53
|
+
});
|
|
50
54
|
const criteria = parseArrayJudgeResponse(responseText, rubric);
|
|
51
55
|
const score = computeWeightedScore(criteria);
|
|
52
56
|
return { score, criteria };
|
|
53
57
|
}
|
|
54
|
-
async function callJudge(runResult, prompt) {
|
|
55
|
-
const adapter = getAdapter(runResult.agentConfig.adapter);
|
|
56
|
-
const workspace = fs.mkdtempSync(path.join(os.tmpdir(), "axis-judge-"));
|
|
57
|
-
try {
|
|
58
|
-
const output = await adapter.run({
|
|
59
|
-
prompt,
|
|
60
|
-
config: runResult.agentConfig,
|
|
61
|
-
scenario: {
|
|
62
|
-
key: "__judge__",
|
|
63
|
-
name: "AXIS Judge",
|
|
64
|
-
prompt,
|
|
65
|
-
rubric: [],
|
|
66
|
-
},
|
|
67
|
-
workingDirectory: workspace,
|
|
68
|
-
});
|
|
69
|
-
return output.result ?? "";
|
|
70
|
-
}
|
|
71
|
-
finally {
|
|
72
|
-
try {
|
|
73
|
-
fs.rmSync(workspace, { recursive: true, force: true });
|
|
74
|
-
}
|
|
75
|
-
catch {
|
|
76
|
-
/* ignore */
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
58
|
/** Max characters for the condensed transcript section. */
|
|
81
59
|
const MAX_TRANSCRIPT_CHARS = 50_000;
|
|
82
60
|
/** Max characters per individual transcript entry. */
|
|
83
61
|
const MAX_ENTRY_CHARS = 2_000;
|
|
84
62
|
function buildStringRubricPrompt(result, entries, finalResult, rubric) {
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
---
|
|
95
|
-
|
|
96
|
-
AGENT TRANSCRIPT (condensed):
|
|
97
|
-
${formatTranscriptForJudge(entries)}
|
|
98
|
-
|
|
99
|
-
---
|
|
100
|
-
|
|
101
|
-
AGENT'S FINAL RESULT:
|
|
102
|
-
${finalResult ?? "(no final result)"}
|
|
103
|
-
|
|
104
|
-
---
|
|
105
|
-
|
|
106
|
-
RUBRIC:
|
|
107
|
-
${rubric}
|
|
108
|
-
|
|
109
|
-
---
|
|
110
|
-
|
|
111
|
-
INSTRUCTIONS:
|
|
112
|
-
1. Review the transcript to understand what the agent did.
|
|
113
|
-
2. Where possible, independently verify the results — visit URLs, check endpoints, confirm that the claimed outcomes actually exist. Do not trust the transcript alone.
|
|
114
|
-
3. Score based on what you can verify, not just what the agent claims.
|
|
115
|
-
|
|
116
|
-
When done, respond with ONLY valid JSON on its own line:
|
|
117
|
-
{"score": <0-10>, "rationale": "<1-2 sentence explanation>"}
|
|
118
|
-
|
|
119
|
-
Score guide: 0 = not met at all, 5 = partially met, 10 = fully met.`;
|
|
63
|
+
const { goal_string_rubric } = getPromptTemplates();
|
|
64
|
+
return interpolate(goal_string_rubric.template, {
|
|
65
|
+
scenarioName: result.scenarioName,
|
|
66
|
+
prompt: getOriginalPrompt(result),
|
|
67
|
+
transcript: formatTranscriptForJudge(entries),
|
|
68
|
+
finalResult: finalResult ?? "(no final result)",
|
|
69
|
+
rubric,
|
|
70
|
+
});
|
|
120
71
|
}
|
|
121
72
|
function buildArrayRubricPrompt(result, entries, finalResult, rubric) {
|
|
122
73
|
const rubricText = rubric.map((r, i) => `${i}. "${r.check}" (weight: ${r.weight})`).join("\n");
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
---
|
|
133
|
-
|
|
134
|
-
AGENT TRANSCRIPT (condensed):
|
|
135
|
-
${formatTranscriptForJudge(entries)}
|
|
136
|
-
|
|
137
|
-
---
|
|
138
|
-
|
|
139
|
-
AGENT'S FINAL RESULT:
|
|
140
|
-
${finalResult ?? "(no final result)"}
|
|
141
|
-
|
|
142
|
-
---
|
|
143
|
-
|
|
144
|
-
RUBRIC CRITERIA:
|
|
145
|
-
${rubricText}
|
|
146
|
-
|
|
147
|
-
---
|
|
148
|
-
|
|
149
|
-
INSTRUCTIONS:
|
|
150
|
-
1. Review the transcript to understand what the agent did.
|
|
151
|
-
2. Where possible, independently verify the results — visit URLs, check endpoints, confirm that the claimed outcomes actually exist. Do not trust the transcript alone.
|
|
152
|
-
3. For each criterion, provide a score from 0 to 10 and a brief rationale.
|
|
153
|
-
|
|
154
|
-
Score guide: 0 = not met at all, 5 = partially met, 10 = fully met.
|
|
155
|
-
|
|
156
|
-
When done, respond with ONLY valid JSON on its own line:
|
|
157
|
-
{"grades": [{"criterion_index": 0, "score": <0-10>, "rationale": "<string>"}, ...]}`;
|
|
74
|
+
const { goal_array_rubric } = getPromptTemplates();
|
|
75
|
+
return interpolate(goal_array_rubric.template, {
|
|
76
|
+
scenarioName: result.scenarioName,
|
|
77
|
+
prompt: getOriginalPrompt(result),
|
|
78
|
+
transcript: formatTranscriptForJudge(entries),
|
|
79
|
+
finalResult: finalResult ?? "(no final result)",
|
|
80
|
+
rubricText,
|
|
81
|
+
});
|
|
158
82
|
}
|
|
159
83
|
function getOriginalPrompt(result) {
|
|
160
84
|
return result.prompt;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"goal-achievement.js","sourceRoot":"","sources":["../../src/scoring/goal-achievement.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"goal-achievement.js","sourceRoot":"","sources":["../../src/scoring/goal-achievement.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAExE,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,MAAiB,EACjB,iBAAoC;IAEpC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;IAC1B,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,CAAC,MAAM,CAAC;IAE9C,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAC/B,OAAO,iBAAiB,CAAC,MAAM,EAAE,MAAM,EAAE,iBAAiB,EAAE,WAAW,CAAC,CAAC;IAC3E,CAAC;IAED,IAAI,CAAC,MAAM,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACnC,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;IACpC,CAAC;IAED,OAAO,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,iBAAiB,EAAE,WAAW,CAAC,CAAC;AAC1E,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,SAAoB,EACpB,MAAc,EACd,OAA0B,EAC1B,WAA0B;IAE1B,MAAM,MAAM,GAAG,uBAAuB,CAAC,SAAS,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;IAChF,MAAM,YAAY,GAAG,MAAM,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE;QACtD,WAAW,EAAE,WAAW;QACxB,YAAY,EAAE,YAAY;KAC3B,CAAC,CAAC;IAEH,MAAM,MAAM,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAC;IAC/C,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;QAChD,OAAO;YACL,KAAK,EAAE,CAAC;YACR,QAAQ,EAAE;gBACR;oBACE,KAAK,EAAE,MAAM;oBACb,MAAM,EAAE,GAAG;oBACX,KAAK,EAAE,CAAC;oBACR,SAAS,EAAE,gCAAgC;iBAC5C;aACF;SACF,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAClE,OAAO;QACL,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,GAAG,CAAC;QACrC,QAAQ,EAAE;YACR;gBACE,KAAK,EAAE,MAAM;gBACb,MAAM,EAAE,GAAG;gBACX,KAAK;gBACL,SAAS,EAAG,MAAM,CAAC,SAAoB,IAAI,EAAE;aAC9C;SACF;KACF,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,gBAAgB,CAC7B,SAAoB,EACpB,MAAyB,EACzB,OAA0B,EAC1B,WAA0B;IAE1B,MAAM,MAAM,GAAG,sBAAsB,CAAC,SAAS,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;IAC/E,MAAM,YAAY,GAAG,MAAM,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE;QACtD,WAAW,EAAE,WAAW;QACxB,YAAY,EAAE,YAAY;KAC3B,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,uBAAuB,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAC
/D,MAAM,KAAK,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAE7C,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;AAC7B,CAAC;AAGD,2DAA2D;AAC3D,MAAM,oBAAoB,GAAG,MAAM,CAAC;AAEpC,sDAAsD;AACtD,MAAM,eAAe,GAAG,KAAK,CAAC;AAE9B,SAAS,uBAAuB,CAC9B,MAAiB,EACjB,OAA0B,EAC1B,WAA0B,EAC1B,MAAc;IAEd,MAAM,EAAE,kBAAkB,EAAE,GAAG,kBAAkB,EAAE,CAAC;IAEpD,OAAO,WAAW,CAAC,kBAAkB,CAAC,QAAQ,EAAE;QAC9C,YAAY,EAAE,MAAM,CAAC,YAAY;QACjC,MAAM,EAAE,iBAAiB,CAAC,MAAM,CAAC;QACjC,UAAU,EAAE,wBAAwB,CAAC,OAAO,CAAC;QAC7C,WAAW,EAAE,WAAW,IAAI,mBAAmB;QAC/C,MAAM;KACP,CAAC,CAAC;AACL,CAAC;AAED,SAAS,sBAAsB,CAC7B,MAAiB,EACjB,OAA0B,EAC1B,WAA0B,EAC1B,MAAyB;IAEzB,MAAM,UAAU,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,KAAK,cAAc,CAAC,CAAC,MAAO,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChG,MAAM,EAAE,iBAAiB,EAAE,GAAG,kBAAkB,EAAE,CAAC;IAEnD,OAAO,WAAW,CAAC,iBAAiB,CAAC,QAAQ,EAAE;QAC7C,YAAY,EAAE,MAAM,CAAC,YAAY;QACjC,MAAM,EAAE,iBAAiB,CAAC,MAAM,CAAC;QACjC,UAAU,EAAE,wBAAwB,CAAC,OAAO,CAAC;QAC7C,WAAW,EAAE,WAAW,IAAI,mBAAmB;QAC/C,UAAU;KACX,CAAC,CAAC;AACL,CAAC;AAED,SAAS,iBAAiB,CAAC,MAAiB;IAC1C,OAAO,MAAM,CAAC,MAAM,CAAC;AACvB,CAAC;AAED;;GAEG;AACH,SAAS,wBAAwB,CAAC,OAA0B;IAC1D,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,oBAAoB,CAAC;IAEtD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAEnD,IAAI,UAAU,GAAG,SAAS,CAAC,MAAM,GAAG,oBAAoB,EAAE,CAAC;YACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;YACrC,KAAK,CAAC,IAAI,CAAC,UAAU,SAAS,sCAAsC,CAAC,CAAC;YACtE,MAAM;QACR,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACtB,UAAU,IAAI,SAAS,CAAC,MAAM,CAAC;IACjC,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,KAAsB,EAAE,KAAa;IAC1D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;QACnB,KAAK,WAAW;YACd,OAAO,IAAI,KAAK,gBAAgB,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,WAAW,EAAE,eAAe,CAAC,EAAE,CAAC;QACzF,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,IAAI,S
AAS,CAAC;YACzC,MAAM,KAAK,GAAG,KAAK,CAAC,gBAAgB,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,KAAK,CAAC,gBAAgB,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YACzF,OAAO,IAAI,KAAK,eAAe,IAAI,GAAG,KAAK,EAAE,CAAC;QAChD,CAAC;QACD,KAAK,aAAa;YAChB,OAAO,IAAI,KAAK,kBAAkB,QAAQ,CAAC,KAAK,CAAC,cAAc,IAAI,aAAa,EAAE,eAAe,CAAC,EAAE,CAAC;QACvG,KAAK,OAAO;YACV,OAAO,IAAI,KAAK,YAAY,QAAQ,CAAC,KAAK,CAAC,YAAY,IAAI,KAAK,CAAC,IAAI,IAAI,iBAAiB,EAAE,eAAe,CAAC,EAAE,CAAC;QACjH,KAAK,QAAQ;YACX,OAAO,IAAI,KAAK,aAAa,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,GAAG,CAAC,EAAE,CAAC;QAC7E,KAAK,MAAM;YACT,OAAO,IAAI,KAAK,WAAW,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,eAAe,CAAC,EAAE,CAAC;QACvF;YACE,OAAO,IAAI,KAAK,KAAK,KAAK,CAAC,IAAI,KAAK,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,GAAG,CAAC,EAAE,CAAC;IACtF,CAAC;AACH,CAAC;AAED,SAAS,QAAQ,CAAC,IAAY,EAAE,MAAc;IAC5C,IAAI,IAAI,CAAC,MAAM,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC;IACvC,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,GAAG,KAAK,CAAC;AACvC,CAAC;AAED,SAAS,uBAAuB,CAAC,YAAoB,EAAE,MAAyB;IAC9E,MAAM,MAAM,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAC;IAC/C,IAAI,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7C,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACxB,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,MAAM,EAAE,CAAC,CAAC,MAAO;YACjB,KAAK,EAAE,CAAC;YACR,SAAS,EAAE,gCAAgC;SAC5C,CAAC,CAAC,CAAC;IACN,CAAC;IAED,MAAM,MAAM,GAAG,MAAM,CAAC,MAIpB,CAAC;IAEH,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACzB,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,KAAK,CAAC,CAAC,CAAC;QAC1D,OAAO;YACL,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,MAAM,EAAE,CAAC,CAAC,MAAO;YACjB,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrE,SAAS,EAAE,KAAK,EAAE,SAAS,IAAI,mBAAmB;SACnD,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,oBAAoB,CAAC,QAA0B;IACtD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEpC,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM
,EAAE,CAAC,CAAC,CAAC;IACnE,IAAI,WAAW,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEhC,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAEpF,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,WAAW,GAAG,WAAW,CAAC,GAAG,GAAG,CAAC,CAAC;AACvD,CAAC"}
|
package/dist/scoring/index.d.ts
CHANGED
|
@@ -2,7 +2,7 @@ import type { RunOutput, RunResult } from "../types/output.js";
|
|
|
2
2
|
import type { ScoredOutput, ScoredRunResult, ScoringOptions } from "../types/scoring.js";
|
|
3
3
|
/**
|
|
4
4
|
* Score a single run result using the interaction-based evaluation pipeline:
|
|
5
|
-
* normalize → sparse index →
|
|
5
|
+
* normalize → sparse index → write raw data → (deep eval || goal achievement) → category score → composite
|
|
6
6
|
*/
|
|
7
7
|
export declare function scoreRunResult(result: RunResult, options?: ScoringOptions): Promise<ScoredRunResult>;
|
|
8
8
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scoring/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAE/D,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAe,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAgBtG;;;GAGG;AACH,wBAAsB,cAAc,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,eAAe,CAAC,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scoring/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAE/D,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAe,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAgBtG;;;GAGG;AACH,wBAAsB,cAAc,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,eAAe,CAAC,CAuF1G;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,SAAS,EAAE,aAAa,EAAE,eAAe,EAAE,GAAG,YAAY,CAmBtG;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,SAAS,EAAE,SAAS,EAAE,OAAO,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,YAAY,CAAC,CAIxG"}
|
package/dist/scoring/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { normalizeTranscript, toTranscriptAnalysis } from "../transcript/normalize.js";
|
|
2
|
+
import { writeScenarioRawData } from "../reports/writer.js";
|
|
2
3
|
import { scoreGoalAchievement } from "./goal-achievement.js";
|
|
3
4
|
import { buildSparseIndex, populateInteractionContent } from "./sparse-index.js";
|
|
4
|
-
import { runTriage } from "./triage.js";
|
|
5
5
|
import { runDeepEval } from "./deep-eval.js";
|
|
6
6
|
import { computeCategoryScore } from "./category-score.js";
|
|
7
7
|
import { computeComposite } from "./composite.js";
|
|
@@ -13,7 +13,7 @@ const DEFAULT_WEIGHTS = {
|
|
|
13
13
|
};
|
|
14
14
|
/**
|
|
15
15
|
* Score a single run result using the interaction-based evaluation pipeline:
|
|
16
|
-
* normalize → sparse index →
|
|
16
|
+
* normalize → sparse index → write raw data → (deep eval || goal achievement) → category score → composite
|
|
17
17
|
*/
|
|
18
18
|
export async function scoreRunResult(result, options) {
|
|
19
19
|
const weights = options?.weights ?? DEFAULT_WEIGHTS;
|
|
@@ -26,13 +26,18 @@ export async function scoreRunResult(result, options) {
|
|
|
26
26
|
// Step 2: Build sparse index (deterministic) and populate content for reports
|
|
27
27
|
const sparseIndex = buildSparseIndex(normalized);
|
|
28
28
|
populateInteractionContent(sparseIndex, normalized);
|
|
29
|
-
// Step 3
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
// Step 3: Write raw data to report dir so LLM judges can read it
|
|
30
|
+
if (options?.reportDir) {
|
|
31
|
+
writeScenarioRawData(options.reportDir, result, sparseIndex);
|
|
32
|
+
}
|
|
33
|
+
// Step 4: Deep eval + goal achievement in parallel
|
|
34
|
+
const [deepEvalResult, goalAchievement] = await Promise.all([
|
|
35
|
+
runDeepEval(result, sparseIndex, normalized, {
|
|
36
|
+
weights,
|
|
37
|
+
reportDir: options?.reportDir,
|
|
38
|
+
}),
|
|
32
39
|
scoreGoalAchievement(result, normalized.entries),
|
|
33
40
|
]);
|
|
34
|
-
// Step 4 continued: Deep eval needs triage results
|
|
35
|
-
const deepEvalResult = await runDeepEval(result, sparseIndex, triageResult, normalized);
|
|
36
41
|
// Step 5: Compute category scores
|
|
37
42
|
const necessityMap = new Map(deepEvalResult.necessity.map((n) => [n.category, n]));
|
|
38
43
|
const defaultNecessity = (category) => ({
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/scoring/index.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AACvF,OAAO,EAAE,oBAAoB,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/scoring/index.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AACvF,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAC7D,OAAO,EAAE,gBAAgB,EAAE,0BAA0B,EAAE,MAAM,mBAAmB,CAAC;AACjF,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAC3D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAElD,MAAM,eAAe,GAAmB;IACtC,gBAAgB,EAAE,GAAG;IACrB,WAAW,EAAE,GAAG;IAChB,OAAO,EAAE,GAAG;IACZ,KAAK,EAAE,GAAG;CACX,CAAC;AAEF;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,MAAiB,EAAE,OAAwB;IAC9E,MAAM,OAAO,GAAG,OAAO,EAAE,OAAO,IAAI,eAAe,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,EAAE,MAAM,CAAC;IAC/B,MAAM,KAAK,GAAG,GAAG,MAAM,CAAC,WAAW,KAAK,MAAM,CAAC,SAAS,GAAG,CAAC;IAE5D,MAAM,EAAE,OAAO,EAAE,CAAC,WAAW,KAAK,KAAK,CAAC,CAAC;IACzC,OAAO,EAAE,UAAU,EAAE,CAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;IAErE,qDAAqD;IACrD,MAAM,UAAU,GAAG,mBAAmB,CAAC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;IAEjE,8EAA8E;IAC9E,MAAM,WAAW,GAAG,gBAAgB,CAAC,UAAU,CAAC,CAAC;IACjD,0BAA0B,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC;IAEpD,iEAAiE;IACjE,IAAI,OAAO,EAAE,SAAS,EAAE,CAAC;QACvB,oBAAoB,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,EAAE,WAAW,CAAC,CAAC;IAC/D,CAAC;IAED,mDAAmD;IACnD,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC1D,WAAW,CAAC,MAAM,EAAE,WAAW,EAAE,UAAU,EAAE;YAC3C,OAAO;YACP,SAAS,EAAE,OAAO,EAAE,SAAS;SAC9B,CAAC;QACF,oBAAoB,CAAC,MAAM,EAAE,UAAU,CAAC,OAAO,CAAC;KACjD,CAAC,CAAC;IAEH,kCAAkC;IAClC,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IACnF,MAAM,gBAAgB,GAAG,CAAC,QAA6C,EAAE,EAAE,CAAC,CAAC;QAC3E,QAAQ;QACR,KAAK,EAAE,GAAG;QACV,cAAc,EAAE,EAAc;QAC9B,SAAS,EAAE,SAAS;KACrB,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,oBAAoB,CACtC,aAAa,EACb,cAAc,CAAC,MAAM,EACrB,YAAY,CAAC,GAAG,CAAC,aAAa,CAAC,IAAI,gBAAgB,CAAC,aAAa,CAAC,EAClE,WAAW,CAAC,YAAY,CACzB,CAAC;IAEF,MAAM,OAAO,GAAG,oBAAoB,CAClC,SAAS,EACT,cAAc,CAAC,MAAM,EACrB,YAAY,CAA
C,GAAG,CAAC,SAAS,CAAC,IAAI,gBAAgB,CAAC,SAAS,CAAC,EAC1D,WAAW,CAAC,YAAY,CACzB,CAAC;IAEF,MAAM,KAAK,GAAG,oBAAoB,CAChC,OAAO,EACP,cAAc,CAAC,MAAM,EACrB,YAAY,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,gBAAgB,CAAC,OAAO,CAAC,EACtD,WAAW,CAAC,YAAY,CACzB,CAAC;IAEF,uCAAuC;IACvC,MAAM,SAAS,GAAG,gBAAgB,CAAC,eAAe,CAAC,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAElH,MAAM,KAAK,GAAgB;QACzB,SAAS;QACT,eAAe;QACf,WAAW;QACX,OAAO;QACP,KAAK;QACL,OAAO;QACP,WAAW;KACZ,CAAC;IAEF,OAAO,EAAE,UAAU,EAAE,CAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEpE,sEAAsE;IACtE,MAAM,CAAC,MAAM,CAAC,kBAAkB,GAAG,oBAAoB,CAAC,UAAU,CAAC,CAAC;IAEpE,OAAO;QACL,WAAW,EAAE,MAAM,CAAC,WAAW;QAC/B,YAAY,EAAE,MAAM,CAAC,YAAY;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,WAAW,EAAE,MAAM,CAAC,WAAW;QAC/B,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAAC,SAAoB,EAAE,aAAgC;IACtF,MAAM,gBAAgB,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,KAAK,CAAC,CAAC,CAAC;IACvF,MAAM,gBAAgB,GACpB,gBAAgB,CAAC,MAAM,GAAG,CAAC;QACzB,CAAC,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,gBAAgB,CAAC,MAAM;QAC3F,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,OAAO,EAAE,SAAS,CAAC,OAAO;QAC1B,SAAS,EAAE,SAAS,CAAC,SAAS;QAC9B,UAAU,EAAE,SAAS,CAAC,UAAU;QAChC,OAAO,EAAE,aAAa;QACtB,OAAO,EAAE;YACP,KAAK,EAAE,SAAS,CAAC,OAAO,CAAC,KAAK;YAC9B,SAAS,EAAE,SAAS,CAAC,OAAO,CAAC,SAAS;YACtC,MAAM,EAAE,SAAS,CAAC,OAAO,CAAC,MAAM;YAChC,gBAAgB,EAAE,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC;SAC/C;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,SAAoB,EAAE,OAAwB;IAC/E,MAAM,aAAa,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC;IAElG,OAAO,iBAAiB,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;AACrD,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { RunResult } from "../types/output.js";
|
|
2
|
+
export interface JudgeCallOptions {
|
|
3
|
+
/** Scenario key for the judge run (e.g., "__env_eval__"). */
|
|
4
|
+
scenarioKey: string;
|
|
5
|
+
/** Human-readable name for the judge run. */
|
|
6
|
+
scenarioName: string;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Call an LLM judge using the same adapter as the test run.
|
|
10
|
+
*
|
|
11
|
+
* Uses the agent's original workspace when available so the judge can
|
|
12
|
+
* independently verify the agent's actual work (files created, endpoints
|
|
13
|
+
* deployed, etc.). Falls back to a disposable temp directory only when
|
|
14
|
+
* no workspace is set (e.g. programmatic API usage without the runner).
|
|
15
|
+
*/
|
|
16
|
+
export declare function callJudge(runResult: RunResult, prompt: string, options: JudgeCallOptions): Promise<string>;
|
|
17
|
+
//# sourceMappingURL=judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../src/scoring/judge.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAEpD,MAAM,WAAW,gBAAgB;IAC/B,6DAA6D;IAC7D,WAAW,EAAE,MAAM,CAAC;IACpB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;GAOG;AACH,wBAAsB,SAAS,CAC7B,SAAS,EAAE,SAAS,EACpB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,MAAM,CAAC,CA6BjB"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as os from "node:os";
|
|
3
|
+
import * as path from "node:path";
|
|
4
|
+
import { getAdapter } from "../adapters/registry.js";
|
|
5
|
+
/**
|
|
6
|
+
* Call an LLM judge using the same adapter as the test run.
|
|
7
|
+
*
|
|
8
|
+
* Uses the agent's original workspace when available so the judge can
|
|
9
|
+
* independently verify the agent's actual work (files created, endpoints
|
|
10
|
+
* deployed, etc.). Falls back to a disposable temp directory only when
|
|
11
|
+
* no workspace is set (e.g. programmatic API usage without the runner).
|
|
12
|
+
*/
|
|
13
|
+
export async function callJudge(runResult, prompt, options) {
|
|
14
|
+
const adapter = getAdapter(runResult.agentConfig.adapter);
|
|
15
|
+
const originalWorkspace = runResult.workingDirectory;
|
|
16
|
+
const workspace = originalWorkspace ?? fs.mkdtempSync(path.join(os.tmpdir(), `axis-${options.scenarioKey}-`));
|
|
17
|
+
const shouldCleanup = !originalWorkspace;
|
|
18
|
+
try {
|
|
19
|
+
const output = await adapter.run({
|
|
20
|
+
prompt,
|
|
21
|
+
config: runResult.agentConfig,
|
|
22
|
+
scenario: {
|
|
23
|
+
key: options.scenarioKey,
|
|
24
|
+
name: options.scenarioName,
|
|
25
|
+
prompt,
|
|
26
|
+
rubric: [],
|
|
27
|
+
},
|
|
28
|
+
workingDirectory: workspace,
|
|
29
|
+
});
|
|
30
|
+
return output.result ?? "";
|
|
31
|
+
}
|
|
32
|
+
finally {
|
|
33
|
+
if (shouldCleanup) {
|
|
34
|
+
try {
|
|
35
|
+
fs.rmSync(workspace, { recursive: true, force: true });
|
|
36
|
+
}
|
|
37
|
+
catch {
|
|
38
|
+
/* ignore */
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
//# sourceMappingURL=judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.js","sourceRoot":"","sources":["../../src/scoring/judge.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAUrD;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,SAAoB,EACpB,MAAc,EACd,OAAyB;IAEzB,MAAM,OAAO,GAAG,UAAU,CAAC,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAC1D,MAAM,iBAAiB,GAAG,SAAS,CAAC,gBAAgB,CAAC;IAErD,MAAM,SAAS,GAAG,iBAAiB,IAAI,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,QAAQ,OAAO,CAAC,WAAW,GAAG,CAAC,CAAC,CAAC;IAC9G,MAAM,aAAa,GAAG,CAAC,iBAAiB,CAAC;IAEzC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC/B,MAAM;YACN,MAAM,EAAE,SAAS,CAAC,WAAW;YAC7B,QAAQ,EAAE;gBACR,GAAG,EAAE,OAAO,CAAC,WAAW;gBACxB,IAAI,EAAE,OAAO,CAAC,YAAY;gBAC1B,MAAM;gBACN,MAAM,EAAE,EAAE;aACX;YACD,gBAAgB,EAAE,SAAS;SAC5B,CAAC,CAAC;QACH,OAAO,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;IAC7B,CAAC;YAAS,CAAC;QACT,IAAI,aAAa,EAAE,CAAC;YAClB,IAAI,CAAC;gBACH,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACzD,CAAC;YAAC,MAAM,CAAC;gBACP,YAAY;YACd,CAAC;QACH,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Declarative prompt templates for the AXIS scoring pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Each template uses `{{variable}}` placeholders that are substituted at
|
|
5
|
+
* runtime via `interpolate()`. The raw templates (with placeholders intact)
|
|
6
|
+
* are exposed via `getPromptTemplates()` so documentation UIs can display
|
|
7
|
+
* the exact prompts used during scoring.
|
|
8
|
+
*/
|
|
9
|
+
/** Describes a single placeholder variable within a prompt template. */
|
|
10
|
+
export interface PromptVariable {
|
|
11
|
+
/** The placeholder name as it appears in `{{name}}`. */
|
|
12
|
+
name: string;
|
|
13
|
+
/** Human-readable description of what this variable contains. */
|
|
14
|
+
description: string;
|
|
15
|
+
/** Type hint for documentation purposes — not enforced at runtime. */
|
|
16
|
+
type: "string" | "number" | "text";
|
|
17
|
+
/** Whether the variable may legitimately resolve to an empty string. */
|
|
18
|
+
optional?: boolean;
|
|
19
|
+
}
|
|
20
|
+
/** A self-describing prompt template for one stage of the scoring pipeline. */
|
|
21
|
+
export interface PromptTemplate {
|
|
22
|
+
/** Unique identifier (also the record key returned by `getPromptTemplates`). */
|
|
23
|
+
name: string;
|
|
24
|
+
/** Human-readable description of what the prompt does. */
|
|
25
|
+
description: string;
|
|
26
|
+
/** Which scoring pipeline stage uses this prompt. */
|
|
27
|
+
stage: "deep_eval" | "goal_achievement";
|
|
28
|
+
/** The template string with `{{variable}}` placeholders. */
|
|
29
|
+
template: string;
|
|
30
|
+
/** Metadata about each placeholder the template accepts. */
|
|
31
|
+
variables: PromptVariable[];
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Replace `{{key}}` placeholders in `template` with values from `vars`.
|
|
35
|
+
*
|
|
36
|
+
* Throws if any placeholder in the template has no corresponding key in
|
|
37
|
+
* `vars`. Numbers are coerced to strings via `String()`.
|
|
38
|
+
*/
|
|
39
|
+
export declare function interpolate(template: string, vars: Record<string, string | number>): string;
|
|
40
|
+
/** Per-category evaluation guidance for focused judge prompts. */
|
|
41
|
+
export declare const CATEGORY_GUIDANCE: Record<string, string>;
|
|
42
|
+
/**
|
|
43
|
+
* Return all scoring prompt templates keyed by name.
|
|
44
|
+
*
|
|
45
|
+
* The templates contain `{{variable}}` placeholders — use `interpolate()`
|
|
46
|
+
* to substitute runtime values, or display the raw template text in
|
|
47
|
+
* documentation UIs.
|
|
48
|
+
*/
|
|
49
|
+
export declare function getPromptTemplates(): Record<string, PromptTemplate>;
|
|
50
|
+
//# sourceMappingURL=prompt-templates.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompt-templates.d.ts","sourceRoot":"","sources":["../../src/scoring/prompt-templates.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAMH,wEAAwE;AACxE,MAAM,WAAW,cAAc;IAC7B,wDAAwD;IACxD,IAAI,EAAE,MAAM,CAAC;IACb,iEAAiE;IACjE,WAAW,EAAE,MAAM,CAAC;IACpB,sEAAsE;IACtE,IAAI,EAAE,QAAQ,GAAG,QAAQ,GAAG,MAAM,CAAC;IACnC,wEAAwE;IACxE,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED,+EAA+E;AAC/E,MAAM,WAAW,cAAc;IAC7B,gFAAgF;IAChF,IAAI,EAAE,MAAM,CAAC;IACb,0DAA0D;IAC1D,WAAW,EAAE,MAAM,CAAC;IACpB,qDAAqD;IACrD,KAAK,EAAE,WAAW,GAAG,kBAAkB,CAAC;IACxC,4DAA4D;IAC5D,QAAQ,EAAE,MAAM,CAAC;IACjB,4DAA4D;IAC5D,SAAS,EAAE,cAAc,EAAE,CAAC;CAC7B;AAMD;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAAC,GAAG,MAAM,CAO3F;AAMD,kEAAkE;AAClE,eAAO,MAAM,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CA2BpD,CAAC;AA4LF;;;;;;GAMG;AACH,wBAAgB,kBAAkB,IAAI,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAMnE"}
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Declarative prompt templates for the AXIS scoring pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Each template uses `{{variable}}` placeholders that are substituted at
|
|
5
|
+
* runtime via `interpolate()`. The raw templates (with placeholders intact)
|
|
6
|
+
* are exposed via `getPromptTemplates()` so documentation UIs can display
|
|
7
|
+
* the exact prompts used during scoring.
|
|
8
|
+
*/
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// Interpolation
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
/**
 * Replace `{{key}}` placeholders in `template` with values from `vars`.
 *
 * Only placeholders made of word characters (`\w+`) are recognised; any
 * other `{{...}}` text is left in the output untouched.
 *
 * @param {string} template - Text containing zero or more `{{key}}` placeholders.
 * @param {Record<string, string | number>} vars - Substitution values keyed by
 *   placeholder name. Values are coerced to strings via `String()`.
 * @returns {string} The template with every recognised placeholder replaced.
 * @throws {Error} If a placeholder has no corresponding own property in `vars`.
 */
export function interpolate(template, vars) {
    return template.replace(/\{\{(\w+)\}\}/g, (_match, key) => {
        // Own-property check: the `in` operator would also accept names inherited
        // from Object.prototype (e.g. `toString`, `constructor`), silently
        // substituting a function's source text instead of reporting the
        // missing variable.
        if (!Object.prototype.hasOwnProperty.call(vars, key)) {
            throw new Error(`Missing template variable: {{${key}}}`);
        }
        return String(vars[key]);
    });
}
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Template definitions
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
 * Evaluation guidance text for each interaction category (`environment`,
 * `service`, `agent`), used by focused judge prompts.
 *
 * Each entry is a multi-line string: an introductory sentence, a blank line,
 * a "Key considerations:" header, and five bullet points.
 */
export const CATEGORY_GUIDANCE = {
    environment: [
        "You are evaluating ENVIRONMENT interactions — file system operations, shell commands, code edits, and local workspace manipulation.",
        "",
        "Key considerations:",
        "- File reads/writes: Was the operation necessary? Was it right-sized (not reading entire files when a section would suffice)?",
        "- Shell commands: Did they succeed? Were they idempotent or did they cause side effects?",
        "- Code edits: Were they precise (targeted edits vs. rewriting entire files)?",
        "- Error recovery: Did the agent handle file-not-found, permission errors, or failed commands well?",
        "- Workspace hygiene: Did the agent clean up temp files, avoid polluting the workspace?",
    ].join("\n"),
    service: [
        "You are evaluating SERVICE interactions — API calls, web fetches, external service requests, and network operations.",
        "",
        "Key considerations:",
        "- API calls: Were they well-formed? Did the agent handle rate limits, auth errors, and timeouts?",
        "- Web fetches: Did the agent fetch relevant pages? Were redundant fetches avoided?",
        "- Data handling: Was response data used effectively or was it fetched and ignored?",
        "- Error recovery: Did the agent retry appropriately on transient failures?",
        "- Efficiency: Were batch operations used when available instead of multiple individual calls?",
    ].join("\n"),
    agent: [
        "You are evaluating AGENT interactions — the agent's own reasoning, planning, tool discovery, and communication.",
        "",
        "Key considerations:",
        "- Planning: Did the agent form a clear plan before acting, or did it thrash?",
        "- Tool discovery: Was tool/capability lookup necessary, or was it redundant exploration?",
        "- Reasoning quality: Was the agent's reasoning focused and productive?",
        "- Human interaction: Were questions to the user clear and necessary, or could the agent have proceeded independently?",
        "- Self-correction: When the agent detected errors, did it adjust its approach efficiently?",
    ].join("\n"),
};
|
|
56
|
+
/**
 * Prompt template for the per-category evaluation pass (stage `deep_eval`).
 *
 * `template` holds the raw prompt text with `{{placeholder}}` markers, and
 * `variables` describes each placeholder that appears in it.
 */
const CATEGORY_EVAL_TEMPLATE = {
    name: "category_eval",
    description:
        "Per-category evaluation of interactions for success, weight, contextRelevance, necessity, and patterns. Run once per category (environment, service, agent) in parallel.",
    stage: "deep_eval",
    // The prompt lines sit at column 0 on purpose: any leading whitespace
    // inside the template literal would become part of the prompt.
    template: `You are an expert evaluator for AXIS, an AI agent testing framework.

You are evaluating the {{categoryName}} dimension of an agent execution. Focus ONLY on {{categoryName}} interactions, but use the full transcript context to understand the agent's overall behavior.

SCENARIO: {{scenarioName}}

TASK GIVEN TO AGENT:
{{prompt}}

COMPLETE SPARSE INDEX ({{totalInteractions}} total interactions, {{categoryInteractionCount}} are {{categoryName}}):
{{sparseLines}}

{{categoryName}} CATEGORY GUIDANCE:
{{categoryGuidance}}

{{categoryName}} INTERACTION DETAILS ({{categoryInteractionCount}} interactions):
{{interactionContent}}

RAW DATA FILES (for additional detail if needed):
{{dataDir}}

NOTE: Content shown above may be truncated for evaluation purposes. This does NOT mean the agent's actual tool results were truncated — evaluate based on the quality and structure of what is shown, not on apparent truncation boundaries.

EVALUATION DIMENSIONS (score each 0.0 to 1.0):
- success: Did the interaction complete without errors? Were the results correct and usable? Evaluate based on the actual content returned, not assumptions about what a "complete" result should look like.
- weight: Was the tool invocation right-sized for the operation? Evaluate whether the agent sent an appropriate amount of data to the tool and received a proportionate response. (1.0 = right-sized, 0.3 = bloated/wasteful)
- contextRelevance: Was the tool's output relevant and usable for the task? If the tool succeeded and the agent used the output to make progress, score 1.0. Only reduce this score if the output was genuinely irrelevant noise that the agent could not use. (1.0 = all useful/necessary, 0.0 = all noise)

Also evaluate NECESSITY for the {{categoryName}} category as a whole:
- necessity (0.0 to 1.0): Were the {{categoryName}} interactions necessary for the task? Evaluate only what the agent actually did — do not penalize for hypothetical steps it could have taken. 1.0 = all interactions were necessary, 0.0 = all were unnecessary.
- List any interaction IDs that were unnecessary.

Identify any patterns within {{categoryName}} interactions:
- Repeated failures or retries
- Redundant operations (same action performed multiple times)
- Excessive operations for simple tasks
- Wasted effort that didn't lead to progress

CONTEXT FOR EVALUATION:
- Tool discovery (e.g., ToolSearch, ListTools) and agent configuration reads are required infrastructure — do not flag as unnecessary unless genuinely redundant (same query repeated).
- Byte counts in sparse lines show total I/O transferred, not file content size. Small results are normal for write/edit confirmations.

Respond with ONLY valid JSON:
{
"audits": [
{"id": 1, "success": 0.9, "weight": 0.8, "contextRelevance": 0.6, "rationale": "brief explanation"},
...
],
"necessity": {"score": 0.85, "unnecessaryIds": [4], "rationale": "brief explanation"},
"patterns": [
{"description": "pattern description", "interactionIds": [1, 2, 3], "severity": "high"},
...
]
}

Include an audit for EVERY {{categoryName}} interaction listed in the details above.`,
    // One descriptor per placeholder used in `template`.
    variables: [
        {
            name: "scenarioName",
            description: "Name of the test scenario",
            type: "string",
        },
        {
            name: "prompt",
            description: "The original task prompt given to the agent",
            type: "text",
        },
        {
            name: "categoryName",
            description: "The category being evaluated (environment, service, or agent)",
            type: "string",
        },
        {
            name: "totalInteractions",
            description: "Total number of interactions across all categories",
            type: "number",
        },
        {
            name: "categoryInteractionCount",
            description: "Number of interactions in this specific category",
            type: "number",
        },
        {
            name: "sparseLines",
            description: "Complete sparse index content (all categories)",
            type: "text",
        },
        {
            name: "categoryGuidance",
            description: "Category-specific evaluation guidance",
            type: "text",
        },
        {
            name: "interactionContent",
            description: "Full formatted content for this category's interactions only",
            type: "text",
        },
        {
            // Flagged optional in the metadata; the placeholder itself is
            // still present in the template text above.
            name: "dataDir",
            description: "Path to the report directory containing raw data files",
            type: "string",
            optional: true,
        },
    ],
};
|
|
141
|
+
/**
 * Prompt template for goal-achievement scoring (stage `goal_achievement`)
 * when the rubric is a single string criterion.
 *
 * The prompt asks the judge to reply with one JSON object holding a 0-10
 * `score` and a short `rationale`.
 */
const GOAL_STRING_RUBRIC_TEMPLATE = {
    name: "goal_string_rubric",
    description: "Evaluates goal achievement when the rubric is a single string criterion.",
    stage: "goal_achievement",
    // The prompt lines sit at column 0 on purpose: any leading whitespace
    // inside the template literal would become part of the prompt.
    template: `You are an expert evaluator for an AI agent testing framework called AXIS.

An AI agent was given a task. You must evaluate how well it performed based on the evidence in its transcript.

SCENARIO: {{scenarioName}}

TASK GIVEN TO AGENT:
{{prompt}}

---

AGENT TRANSCRIPT (condensed):
{{transcript}}

---

AGENT'S FINAL RESULT:
{{finalResult}}

---

RUBRIC:
{{rubric}}

---

INSTRUCTIONS:
1. Review the transcript to understand what the agent did.
2. Where possible, independently verify the results — check the filesystem for created/modified files, visit URLs, confirm that the claimed outcomes actually exist. Do not trust the transcript alone.

When done, respond with ONLY valid JSON on its own line:
{"score": <0-10>, "rationale": "<1-2 sentence explanation>"}

Score guide: 0 = not met at all, 5 = partially met, 10 = fully met.`,
    // One descriptor per placeholder used in `template`.
    variables: [
        {
            name: "scenarioName",
            description: "Name of the test scenario",
            type: "string",
        },
        {
            name: "prompt",
            description: "The original task prompt given to the agent",
            type: "text",
        },
        {
            name: "transcript",
            description: "Condensed agent transcript",
            type: "text",
        },
        {
            name: "finalResult",
            description: "The agent's final result text",
            type: "text",
        },
        {
            name: "rubric",
            description: "The evaluation criterion as a single string",
            type: "text",
        },
    ],
};
|
|
187
|
+
/**
 * Prompt template for goal-achievement scoring (stage `goal_achievement`)
 * when the rubric has multiple weighted criteria.
 *
 * The prompt asks the judge to reply with one JSON object whose `grades`
 * array carries a `criterion_index`, a 0-10 `score`, and a `rationale` per
 * criterion.
 */
const GOAL_ARRAY_RUBRIC_TEMPLATE = {
    name: "goal_array_rubric",
    description: "Evaluates goal achievement when the rubric has multiple weighted criteria.",
    stage: "goal_achievement",
    // The prompt lines sit at column 0 on purpose: any leading whitespace
    // inside the template literal would become part of the prompt.
    template: `You are an expert evaluator for an AI agent testing framework called AXIS.

An AI agent was given a task. You must evaluate how well it performed by reviewing its transcript AND by independently verifying the results yourself.

SCENARIO: {{scenarioName}}

TASK GIVEN TO AGENT:
{{prompt}}

---

AGENT TRANSCRIPT (condensed):
{{transcript}}

---

AGENT'S FINAL RESULT:
{{finalResult}}

---

RUBRIC CRITERIA:
{{rubricText}}

---

INSTRUCTIONS:
1. Review the transcript to understand what the agent did.
2. Where possible, independently verify the results — visit URLs, check endpoints, confirm that the claimed outcomes actually exist. Do not trust the transcript alone.
3. For each criterion, provide a score from 0 to 10 and a brief rationale.

Score guide: 0 = not met at all, 5 = partially met, 10 = fully met.

When done, respond with ONLY valid JSON on its own line:
{"grades": [{"criterion_index": 0, "score": <0-10>, "rationale": "<string>"}, ...]}`,
    // One descriptor per placeholder used in `template`.
    variables: [
        {
            name: "scenarioName",
            description: "Name of the test scenario",
            type: "string",
        },
        {
            name: "prompt",
            description: "The original task prompt given to the agent",
            type: "text",
        },
        {
            name: "transcript",
            description: "Condensed agent transcript",
            type: "text",
        },
        {
            name: "finalResult",
            description: "The agent's final result text",
            type: "text",
        },
        {
            name: "rubricText",
            description: "Formatted rubric criteria with weights",
            type: "text",
        },
    ],
};
|
|
234
|
+
// ---------------------------------------------------------------------------
|
|
235
|
+
// Public API
|
|
236
|
+
// ---------------------------------------------------------------------------
|
|
237
|
+
/**
 * Collect every scoring prompt template into one lookup object.
 *
 * The returned templates still contain their raw `{{variable}}` placeholders.
 * Pass a template's text through `interpolate()` to fill in runtime values,
 * or show it as-is in documentation UIs.
 *
 * @returns A new object on each call, mapping each template's name
 *   (`category_eval`, `goal_string_rubric`, `goal_array_rubric`) to its
 *   module-level template definition.
 */
export function getPromptTemplates() {
    const templatesByName = {};
    templatesByName.category_eval = CATEGORY_EVAL_TEMPLATE;
    templatesByName.goal_string_rubric = GOAL_STRING_RUBRIC_TEMPLATE;
    templatesByName.goal_array_rubric = GOAL_ARRAY_RUBRIC_TEMPLATE;
    return templatesByName;
}
|
|
251
|
+
//# sourceMappingURL=prompt-templates.js.map
|