@netlify/axis 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +977 -0
- package/dist/adapters/base/acp-adapter.d.ts +44 -0
- package/dist/adapters/base/acp-adapter.d.ts.map +1 -0
- package/dist/adapters/base/acp-adapter.js +559 -0
- package/dist/adapters/base/acp-adapter.js.map +1 -0
- package/dist/adapters/base/agent-adapter.d.ts +132 -0
- package/dist/adapters/base/agent-adapter.d.ts.map +1 -0
- package/dist/adapters/base/agent-adapter.js +212 -0
- package/dist/adapters/base/agent-adapter.js.map +1 -0
- package/dist/adapters/claude-code.d.ts +3 -0
- package/dist/adapters/claude-code.d.ts.map +1 -0
- package/dist/adapters/claude-code.js +138 -0
- package/dist/adapters/claude-code.js.map +1 -0
- package/dist/adapters/claude-sdk.d.ts +11 -0
- package/dist/adapters/claude-sdk.d.ts.map +1 -0
- package/dist/adapters/claude-sdk.js +46 -0
- package/dist/adapters/claude-sdk.js.map +1 -0
- package/dist/adapters/codex.d.ts +3 -0
- package/dist/adapters/codex.d.ts.map +1 -0
- package/dist/adapters/codex.js +183 -0
- package/dist/adapters/codex.js.map +1 -0
- package/dist/adapters/gemini-acp.d.ts +11 -0
- package/dist/adapters/gemini-acp.d.ts.map +1 -0
- package/dist/adapters/gemini-acp.js +60 -0
- package/dist/adapters/gemini-acp.js.map +1 -0
- package/dist/adapters/gemini.d.ts +3 -0
- package/dist/adapters/gemini.d.ts.map +1 -0
- package/dist/adapters/gemini.js +222 -0
- package/dist/adapters/gemini.js.map +1 -0
- package/dist/adapters/goose.d.ts +3 -0
- package/dist/adapters/goose.d.ts.map +1 -0
- package/dist/adapters/goose.js +9 -0
- package/dist/adapters/goose.js.map +1 -0
- package/dist/adapters/registry.d.ts +7 -0
- package/dist/adapters/registry.d.ts.map +1 -0
- package/dist/adapters/registry.js +37 -0
- package/dist/adapters/registry.js.map +1 -0
- package/dist/adapters/utils/mcp.d.ts +23 -0
- package/dist/adapters/utils/mcp.d.ts.map +1 -0
- package/dist/adapters/utils/mcp.js +114 -0
- package/dist/adapters/utils/mcp.js.map +1 -0
- package/dist/adapters/utils/resolve.d.ts +20 -0
- package/dist/adapters/utils/resolve.d.ts.map +1 -0
- package/dist/adapters/utils/resolve.js +48 -0
- package/dist/adapters/utils/resolve.js.map +1 -0
- package/dist/adapters/utils/skills.d.ts +17 -0
- package/dist/adapters/utils/skills.d.ts.map +1 -0
- package/dist/adapters/utils/skills.js +52 -0
- package/dist/adapters/utils/skills.js.map +1 -0
- package/dist/adapters/utils/token-estimator.d.ts +21 -0
- package/dist/adapters/utils/token-estimator.d.ts.map +1 -0
- package/dist/adapters/utils/token-estimator.js +37 -0
- package/dist/adapters/utils/token-estimator.js.map +1 -0
- package/dist/baselines/diff.d.ts +9 -0
- package/dist/baselines/diff.d.ts.map +1 -0
- package/dist/baselines/diff.js +83 -0
- package/dist/baselines/diff.js.map +1 -0
- package/dist/baselines/index.d.ts +3 -0
- package/dist/baselines/index.d.ts.map +1 -0
- package/dist/baselines/index.js +3 -0
- package/dist/baselines/index.js.map +1 -0
- package/dist/baselines/store.d.ts +19 -0
- package/dist/baselines/store.d.ts.map +1 -0
- package/dist/baselines/store.js +104 -0
- package/dist/baselines/store.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +487 -0
- package/dist/cli.js.map +1 -0
- package/dist/config/loader.d.ts +8 -0
- package/dist/config/loader.d.ts.map +1 -0
- package/dist/config/loader.js +99 -0
- package/dist/config/loader.js.map +1 -0
- package/dist/config/validator.d.ts +11 -0
- package/dist/config/validator.d.ts.map +1 -0
- package/dist/config/validator.js +203 -0
- package/dist/config/validator.js.map +1 -0
- package/dist/docs-site/_astro/cli.DDWZtG0-.css +1 -0
- package/dist/docs-site/cli/index.html +18 -0
- package/dist/docs-site/configuration/index.html +121 -0
- package/dist/docs-site/content-assets.mjs +1 -0
- package/dist/docs-site/content-modules.mjs +1 -0
- package/dist/docs-site/data-store.json +9 -0
- package/dist/docs-site/index.html +69 -0
- package/dist/docs-site/quickstart/index.html +59 -0
- package/dist/docs-site/running/index.html +87 -0
- package/dist/docs-site/scoring/index.html +135 -0
- package/dist/index.d.ts +19 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +15 -0
- package/dist/index.js.map +1 -0
- package/dist/report-ui/index.html +291 -0
- package/dist/report-ui/mock-data.json +298 -0
- package/dist/reports/html.d.ts +7 -0
- package/dist/reports/html.d.ts.map +1 -0
- package/dist/reports/html.js +27 -0
- package/dist/reports/html.js.map +1 -0
- package/dist/reports/reader.d.ts +21 -0
- package/dist/reports/reader.d.ts.map +1 -0
- package/dist/reports/reader.js +110 -0
- package/dist/reports/reader.js.map +1 -0
- package/dist/reports/writer.d.ts +14 -0
- package/dist/reports/writer.d.ts.map +1 -0
- package/dist/reports/writer.js +106 -0
- package/dist/reports/writer.js.map +1 -0
- package/dist/runner/lifecycle.d.ts +10 -0
- package/dist/runner/lifecycle.d.ts.map +1 -0
- package/dist/runner/lifecycle.js +58 -0
- package/dist/runner/lifecycle.js.map +1 -0
- package/dist/runner/runner.d.ts +34 -0
- package/dist/runner/runner.d.ts.map +1 -0
- package/dist/runner/runner.js +330 -0
- package/dist/runner/runner.js.map +1 -0
- package/dist/scoring/category-score.d.ts +52 -0
- package/dist/scoring/category-score.d.ts.map +1 -0
- package/dist/scoring/category-score.js +157 -0
- package/dist/scoring/category-score.js.map +1 -0
- package/dist/scoring/composite.d.ts +5 -0
- package/dist/scoring/composite.d.ts.map +1 -0
- package/dist/scoring/composite.js +24 -0
- package/dist/scoring/composite.js.map +1 -0
- package/dist/scoring/deep-eval.d.ts +25 -0
- package/dist/scoring/deep-eval.d.ts.map +1 -0
- package/dist/scoring/deep-eval.js +382 -0
- package/dist/scoring/deep-eval.js.map +1 -0
- package/dist/scoring/goal-achievement.d.ts +5 -0
- package/dist/scoring/goal-achievement.d.ts.map +1 -0
- package/dist/scoring/goal-achievement.js +241 -0
- package/dist/scoring/goal-achievement.js.map +1 -0
- package/dist/scoring/index.d.ts +22 -0
- package/dist/scoring/index.d.ts.map +1 -0
- package/dist/scoring/index.js +115 -0
- package/dist/scoring/index.js.map +1 -0
- package/dist/scoring/parse-json.d.ts +6 -0
- package/dist/scoring/parse-json.d.ts.map +1 -0
- package/dist/scoring/parse-json.js +18 -0
- package/dist/scoring/parse-json.js.map +1 -0
- package/dist/scoring/sparse-index.d.ts +15 -0
- package/dist/scoring/sparse-index.d.ts.map +1 -0
- package/dist/scoring/sparse-index.js +338 -0
- package/dist/scoring/sparse-index.js.map +1 -0
- package/dist/scoring/triage.d.ts +15 -0
- package/dist/scoring/triage.d.ts.map +1 -0
- package/dist/scoring/triage.js +204 -0
- package/dist/scoring/triage.js.map +1 -0
- package/dist/skills/resolver.d.ts +19 -0
- package/dist/skills/resolver.d.ts.map +1 -0
- package/dist/skills/resolver.js +95 -0
- package/dist/skills/resolver.js.map +1 -0
- package/dist/transcript/categorize.d.ts +24 -0
- package/dist/transcript/categorize.d.ts.map +1 -0
- package/dist/transcript/categorize.js +233 -0
- package/dist/transcript/categorize.js.map +1 -0
- package/dist/transcript/classify.d.ts +7 -0
- package/dist/transcript/classify.d.ts.map +1 -0
- package/dist/transcript/classify.js +32 -0
- package/dist/transcript/classify.js.map +1 -0
- package/dist/transcript/extract.d.ts +24 -0
- package/dist/transcript/extract.d.ts.map +1 -0
- package/dist/transcript/extract.js +266 -0
- package/dist/transcript/extract.js.map +1 -0
- package/dist/transcript/index.d.ts +3 -0
- package/dist/transcript/index.d.ts.map +1 -0
- package/dist/transcript/index.js +2 -0
- package/dist/transcript/index.js.map +1 -0
- package/dist/transcript/normalize.d.ts +15 -0
- package/dist/transcript/normalize.d.ts.map +1 -0
- package/dist/transcript/normalize.js +160 -0
- package/dist/transcript/normalize.js.map +1 -0
- package/dist/transcript/types.d.ts +92 -0
- package/dist/transcript/types.d.ts.map +1 -0
- package/dist/transcript/types.js +2 -0
- package/dist/transcript/types.js.map +1 -0
- package/dist/transcript/urls.d.ts +10 -0
- package/dist/transcript/urls.d.ts.map +1 -0
- package/dist/transcript/urls.js +31 -0
- package/dist/transcript/urls.js.map +1 -0
- package/dist/types/agent.d.ts +80 -0
- package/dist/types/agent.d.ts.map +1 -0
- package/dist/types/agent.js +2 -0
- package/dist/types/agent.js.map +1 -0
- package/dist/types/baseline.d.ts +65 -0
- package/dist/types/baseline.d.ts.map +1 -0
- package/dist/types/baseline.js +2 -0
- package/dist/types/baseline.js.map +1 -0
- package/dist/types/config.d.ts +76 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/config.js +2 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/index.d.ts +8 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +8 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/output.d.ts +70 -0
- package/dist/types/output.d.ts.map +1 -0
- package/dist/types/output.js +15 -0
- package/dist/types/output.js.map +1 -0
- package/dist/types/report.d.ts +37 -0
- package/dist/types/report.d.ts.map +1 -0
- package/dist/types/report.js +2 -0
- package/dist/types/report.js.map +1 -0
- package/dist/types/scenario.d.ts +23 -0
- package/dist/types/scenario.d.ts.map +1 -0
- package/dist/types/scenario.js +2 -0
- package/dist/types/scenario.js.map +1 -0
- package/dist/types/scoring.d.ts +176 -0
- package/dist/types/scoring.d.ts.map +1 -0
- package/dist/types/scoring.js +2 -0
- package/dist/types/scoring.js.map +1 -0
- package/dist/ui/AnimatedTokens.d.ts +29 -0
- package/dist/ui/AnimatedTokens.d.ts.map +1 -0
- package/dist/ui/AnimatedTokens.js +53 -0
- package/dist/ui/AnimatedTokens.js.map +1 -0
- package/dist/ui/App.d.ts +6 -0
- package/dist/ui/App.d.ts.map +1 -0
- package/dist/ui/App.js +16 -0
- package/dist/ui/App.js.map +1 -0
- package/dist/ui/LiveDuration.d.ts +20 -0
- package/dist/ui/LiveDuration.d.ts.map +1 -0
- package/dist/ui/LiveDuration.js +31 -0
- package/dist/ui/LiveDuration.js.map +1 -0
- package/dist/ui/LiveStatus.d.ts +7 -0
- package/dist/ui/LiveStatus.d.ts.map +1 -0
- package/dist/ui/LiveStatus.js +52 -0
- package/dist/ui/LiveStatus.js.map +1 -0
- package/dist/ui/format.d.ts +29 -0
- package/dist/ui/format.d.ts.map +1 -0
- package/dist/ui/format.js +514 -0
- package/dist/ui/format.js.map +1 -0
- package/package.json +65 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as os from "node:os";
|
|
3
|
+
import * as path from "node:path";
|
|
4
|
+
import { getAdapter } from "../adapters/registry.js";
|
|
5
|
+
import { DEFAULT_AUDIT_SCORES } from "./category-score.js";
|
|
6
|
+
import { parseJsonFromText } from "./parse-json.js";
|
|
7
|
+
/** Per-interaction cap, in characters, on the full content included in the prompt. */
const MAX_CONTENT_PER_INTERACTION = 3_000;
/** Overall cap, in characters, on the interaction content sent to the judge. */
const MAX_TOTAL_CONTENT = 40_000;
/** Cap, in characters, on the sparse index section of the evaluation prompt. */
const MAX_SPARSE_INDEX_CHARS = 60_000;
|
|
13
|
+
/**
 * Run the deep evaluation LLM pass.
 *
 * The judge is asked to score every interaction for success, weight and
 * contextRelevance, and to rate necessity per category. Speed is never taken
 * from the LLM: it is recomputed here from each interaction's timing data, so
 * it stays deterministic.
 *
 * @param result - Run result; supplies the scenario name, task prompt and agent config.
 * @param sparseIndex - Sparse index whose `interactions` are evaluated.
 * @param triage - Triage output forwarded to the prompt builder.
 * @param normalized - Normalized transcript forwarded to the prompt builder.
 * @returns Promise resolving to `{ audits, necessity }`.
 */
export async function runDeepEval(result, sparseIndex, triage, normalized) {
    // Nothing to judge: skip the LLM call and return defaults.
    if (sparseIndex.interactions.length === 0) {
        return buildDefaultResult(sparseIndex);
    }
    const prompt = buildDeepEvalPrompt(result, sparseIndex, triage, normalized);
    const responseText = await callJudge(result, prompt);
    const deepResult = parseDeepEvalResponse(responseText, sparseIndex);
    // Index interactions by id once instead of scanning the list for every audit.
    // The first occurrence of an id wins, matching the Array#find lookup this replaces.
    const interactionsById = new Map();
    for (const interaction of sparseIndex.interactions) {
        if (!interactionsById.has(interaction.id)) {
            interactionsById.set(interaction.id, interaction);
        }
    }
    // Overwrite the placeholder speed on every audit with the heuristic value.
    for (const audit of deepResult.audits) {
        const interaction = interactionsById.get(audit.id);
        if (interaction) {
            audit.speed = computeHeuristicSpeed(interaction);
        }
    }
    return deepResult;
}
|
|
38
|
+
/**
 * Compute a heuristic speed score (0-1) for an interaction from its duration
 * and categories. Deterministic — no LLM needed.
 *
 * Thresholds are generous to account for system overhead
 * (SDK roundtrips, sandbox setup, process spawning).
 *
 * Category precedence: "service" is checked before "environment"; anything
 * else is scored with the agent-thinking thresholds.
 *
 * @param interaction - Object with `durationMs` (milliseconds, may be absent)
 *   and `categories` (array of category names, may be absent).
 * @returns One of 1.0, 0.9, 0.8, 0.6 or 0.4.
 */
export function computeHeuristicSpeed(interaction) {
    const { durationMs, categories } = interaction;
    // No usable timing data (null, undefined, NaN, non-numeric) or a
    // non-positive duration — assume efficient. Checking only `=== null` let
    // undefined/NaN fall through every comparison and land on the worst score.
    if (typeof durationMs !== "number" || Number.isNaN(durationMs) || durationMs <= 0) {
        return 1.0;
    }
    const seconds = durationMs / 1000;
    // A missing category list is scored like agent thinking instead of throwing.
    const kinds = categories ?? [];
    // Each tier is [upper bound in seconds, score]; tiers are checked in order
    // and anything slower than the last bound scores 0.4.
    let tiers;
    if (kinds.includes("service")) {
        // Service interactions (API calls, web fetches): network latency expected.
        tiers = [[2, 1.0], [5, 0.9], [10, 0.8], [25, 0.6]];
    }
    else if (kinds.includes("environment")) {
        // Environment interactions (file ops, shell commands): local, should be near-instant.
        tiers = [[0.5, 1.0], [2, 0.9], [5, 0.8], [10, 0.6]];
    }
    else {
        // Agent thinking: reasoning latency.
        tiers = [[2, 1.0], [5, 0.9], [15, 0.8], [30, 0.6]];
    }
    for (const [limit, score] of tiers) {
        if (seconds <= limit) {
            return score;
        }
    }
    return 0.4;
}
|
|
86
|
+
/**
 * Assemble the judge prompt text for the deep evaluation pass.
 *
 * The prompt embeds, in order: the scenario name and task prompt from
 * `result`; an optional triage section (emitted only when `triage.patterns`
 * is non-empty or at least one `triage.categoryNotes` value is truthy); the
 * sparse index lines after `truncateSparseLines`; the counters from
 * `sparseIndex.stats`; the per-interaction content from
 * `buildInteractionContent`; and finally the scoring rubric and the JSON
 * response format the judge must follow.
 *
 * @returns The complete prompt string.
 */
function buildDeepEvalPrompt(result, sparseIndex, triage, normalized) {
|
|
87
|
+
const { stats } = sparseIndex;
|
|
88
|
+
// Always include the full sparse index
|
|
89
|
+
const sparseLines = truncateSparseLines(sparseIndex.lines);
|
|
90
|
+
// Include full content for ALL interactions (within budget)
|
|
91
|
+
const interactionContent = buildInteractionContent(sparseIndex, triage, normalized);
|
|
92
|
+
// Include triage context when available
|
|
93
|
+
let triageSection = "";
|
|
94
|
+
if (triage.patterns.length > 0 || Object.values(triage.categoryNotes).some((n) => n)) {
|
|
95
|
+
const patternsText = triage.patterns
|
|
96
|
+
.map((p) => `- ${p.description} (severity: ${p.severity}, interactions: #${p.interactionIds.join(", #")})`)
|
|
97
|
+
.join("\n");
|
|
98
|
+
const categories = ["environment", "service", "agent"];
|
|
99
|
+
const categoryNotesText = categories
|
|
100
|
+
.filter((c) => triage.categoryNotes[c])
|
|
101
|
+
.map((c) => `${c}: ${triage.categoryNotes[c]}`)
|
|
102
|
+
.join("\n");
|
|
103
|
+
triageSection = `
|
|
104
|
+
TRIAGE ANALYSIS:
|
|
105
|
+
${categoryNotesText || "(none)"}
|
|
106
|
+
${patternsText ? `\nPATTERNS:\n${patternsText}` : ""}
|
|
107
|
+
`;
|
|
108
|
+
}
|
|
109
|
+
return `You are an expert evaluator for AXIS, an AI agent testing framework.
|
|
110
|
+
|
|
111
|
+
You are performing a comprehensive evaluation of ALL interactions from an agent execution.
|
|
112
|
+
|
|
113
|
+
SCENARIO: ${result.scenarioName}
|
|
114
|
+
|
|
115
|
+
TASK GIVEN TO AGENT:
|
|
116
|
+
${result.prompt}
|
|
117
|
+
${triageSection}
|
|
118
|
+
COMPLETE SPARSE INDEX (${stats.totalInteractions} interactions):
|
|
119
|
+
${sparseLines}
|
|
120
|
+
|
|
121
|
+
STATS:
|
|
122
|
+
- Environment interactions: ${stats.byCategory.environment}
|
|
123
|
+
- Service interactions: ${stats.byCategory.service}
|
|
124
|
+
- Agent interactions: ${stats.byCategory.agent}
|
|
125
|
+
- Errors: ${stats.totalErrors}
|
|
126
|
+
- Total duration: ${stats.totalDurationMs}ms
|
|
127
|
+
|
|
128
|
+
FULL INTERACTION CONTENT:
|
|
129
|
+
${interactionContent}
|
|
130
|
+
|
|
131
|
+
NOTE: Content shown above may be truncated for evaluation purposes. This does NOT mean the agent's actual tool results were truncated — evaluate based on the quality and structure of what is shown, not on apparent truncation boundaries.
|
|
132
|
+
|
|
133
|
+
EVALUATION DIMENSIONS (score each 0.0 to 1.0):
|
|
134
|
+
- success: Did the interaction complete without errors? Were the results correct and usable? Evaluate based on the actual content returned, not assumptions about what a "complete" result should look like. For service calls (API requests, web fetches), if the call returned structured, usable content and the agent used it successfully, score success high — do not speculate about content that might be missing or hypothesize about JS-gated pages or truncation.
|
|
135
|
+
- weight: Was the tool invocation right-sized for the operation? Evaluate whether the agent sent an appropriate amount of data to the tool and received a proportionate response. For environment tools (file writes, edits, shell commands), judge the tool operation — not the semantic quality of the content the agent chose to write. A 2KB file write is right-sized if the agent intended to write 2KB of content. For service calls, if the call returned the data the agent needed, it is right-sized — do not penalize because a page returned fewer bytes than expected. (1.0 = right-sized, 0.3 = bloated/wasteful)
|
|
136
|
+
- contextRelevance: Was the tool's output relevant and usable for the task? If the tool succeeded and the agent used the output to make progress, score 1.0. Only reduce this score if the output was genuinely irrelevant noise that the agent could not use. Do NOT reduce this score for content quality judgments (e.g., whether a summary was condensed enough, whether fetched content was comprehensive enough) — those are evaluated by goal achievement, not here. Agent-internal operations (tool discovery, planning) are necessary infrastructure — score based on whether they were needed. (1.0 = all useful/necessary, 0.0 = all noise)
|
|
137
|
+
|
|
138
|
+
For each CATEGORY present, also evaluate necessity:
|
|
139
|
+
- necessity (0.0 to 1.0): Were the interactions that the agent performed in this category necessary? Evaluate only what the agent actually did — do not penalize for hypothetical steps it could have taken. 1.0 = all interactions were necessary, 0.0 = all were unnecessary.
|
|
140
|
+
- List any interaction IDs that were unnecessary.
|
|
141
|
+
|
|
142
|
+
CONTEXT FOR EVALUATION:
|
|
143
|
+
- Tool discovery (e.g., ToolSearch, ListTools) and agent configuration reads are required infrastructure — do not flag as unnecessary unless genuinely redundant (same query repeated).
|
|
144
|
+
- Byte counts in sparse lines show total I/O transferred, not file content size. Small results are normal for write/edit confirmations.
|
|
145
|
+
- If a service call (API request, web fetch) returned structured, usable content and the agent used it to complete the task, do not flag it for concerns about hypothetical missing content or page size.
|
|
146
|
+
|
|
147
|
+
Respond with ONLY valid JSON:
|
|
148
|
+
{
|
|
149
|
+
"audits": [
|
|
150
|
+
{"id": 1, "category": "environment", "success": 0.9, "weight": 0.8, "contextRelevance": 0.6, "rationale": "brief explanation"},
|
|
151
|
+
...
|
|
152
|
+
],
|
|
153
|
+
"necessity": [
|
|
154
|
+
{"category": "environment", "score": 0.85, "unnecessaryIds": [4], "rationale": "brief explanation"},
|
|
155
|
+
{"category": "service", "score": 0.7, "unnecessaryIds": [5, 6], "rationale": "brief explanation"},
|
|
156
|
+
{"category": "agent", "score": 0.95, "unnecessaryIds": [], "rationale": "brief explanation"}
|
|
157
|
+
]
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
Include an audit for EVERY interaction listed above.`;
|
|
161
|
+
}
|
|
162
|
+
/**
 * Render the "full content" prompt section, one delimited block per interaction.
 *
 * Each interaction's entries are formatted and joined, capped at
 * MAX_CONTENT_PER_INTERACTION characters, and appended until the running
 * total would exceed MAX_TOTAL_CONTENT. At that point a note with the number
 * of interactions left out is appended and rendering stops.
 *
 * @returns The blocks joined by blank lines.
 */
function buildInteractionContent(sparseIndex, triage, normalized) {
    // Triage flags keyed by interaction id (a later flag for the same id replaces an earlier one).
    const flaggedById = new Map();
    for (const flagged of triage.flaggedInteractions) {
        flaggedById.set(flagged.id, flagged);
    }
    const { interactions } = sparseIndex;
    const blocks = [];
    let usedChars = 0;
    for (const [position, interaction] of interactions.entries()) {
        // Full text of the interaction, rebuilt from its normalized entries.
        const rendered = [];
        for (const entryIndex of interaction.entryIndices) {
            rendered.push(formatFullEntry(normalized.entries[entryIndex]));
        }
        let body = rendered.join("\n");
        if (body.length > MAX_CONTENT_PER_INTERACTION) {
            body = body.slice(0, MAX_CONTENT_PER_INTERACTION) + "\n... (truncated)";
        }
        // Budget exhausted: point the judge at the sparse index for the rest.
        if (usedChars + body.length > MAX_TOTAL_CONTENT) {
            const remaining = interactions.length - position;
            blocks.push(`\n... (remaining ${remaining} interactions shown only in sparse index above)`);
            break;
        }
        usedChars += body.length;
        const flagged = flaggedById.get(interaction.id);
        const triageSuffix = flagged ? ` | Triage: ${flagged.reason}` : "";
        const header = `#${interaction.id} | Category: ${interaction.categories.join(", ")}${triageSuffix}`;
        blocks.push(["---", header, body, "---"].join("\n"));
    }
    return blocks.join("\n\n");
}
|
|
191
|
+
/**
 * Render one transcript entry as text for the judge prompt.
 *
 * The first line is a bracketed tag derived from `entry.type`. Tool-use
 * entries may add an input summary and the JSON-serialized input (cut to
 * 1000 characters); tool-result entries may add the result text (cut to
 * 2000 characters).
 *
 * @returns The rendered lines joined with newlines.
 */
function formatFullEntry(entry) {
    const { type } = entry;
    if (type === "assistant") {
        return `[ASSISTANT] ${entry.text ?? "(no text)"}`;
    }
    if (type === "tool_use") {
        const lines = [`[TOOL_USE] ${entry.toolName ?? "unknown"}`];
        if (entry.toolInputSummary) {
            lines.push(` Input: ${entry.toolInputSummary}`);
        }
        if (entry.toolInput) {
            const serialized = JSON.stringify(entry.toolInput);
            const shown = serialized.length > 1000 ? serialized.slice(0, 1000) + "..." : serialized;
            lines.push(` Full input: ${shown}`);
        }
        return lines.join("\n");
    }
    if (type === "tool_result") {
        const lines = [`[TOOL_RESULT]`];
        const text = entry.toolResultText;
        if (text) {
            const shown = text.length > 2000 ? text.slice(0, 2000) + "..." : text;
            lines.push(` Result: ${shown}`);
        }
        return lines.join("\n");
    }
    if (type === "error") {
        return `[ERROR] ${entry.errorMessage ?? entry.text ?? "(unknown error)"}`;
    }
    // Any other entry type: tag it with the upper-cased type name.
    return `[${type.toUpperCase()}] ${entry.text ?? "(no content)"}`;
}
|
|
221
|
+
/**
 * Send the evaluation prompt to the judge and return its text answer.
 *
 * The judge is the adapter named by `runResult.agentConfig.adapter`, run with
 * the same agent config inside a fresh temporary working directory that is
 * removed afterwards whether or not the run succeeds.
 *
 * @returns `output.result` from the adapter, or "" when it is null/undefined.
 */
async function callJudge(runResult, prompt) {
    const adapter = getAdapter(runResult.agentConfig.adapter);
    const scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), "axis-deep-eval-"));
    const request = {
        prompt,
        config: runResult.agentConfig,
        scenario: {
            key: "__deep_eval__",
            name: "AXIS Deep Evaluation",
            prompt,
            rubric: [],
        },
        workingDirectory: scratchDir,
    };
    try {
        const { result } = await adapter.run(request);
        return result ?? "";
    }
    finally {
        // Best-effort cleanup: a failure to delete the scratch directory must
        // not mask the judge's answer or its error.
        try {
            fs.rmSync(scratchDir, { recursive: true, force: true });
        }
        catch {
            /* ignore */
        }
    }
}
|
|
247
|
+
// --- Response parsing ---
|
|
248
|
+
/**
 * Turn the judge's raw text answer into a complete deep-eval result.
 *
 * Audits the LLM returned are kept; every interaction it skipped gets a
 * default audit (success 0.3 when the interaction has an error). Necessity is
 * always reported for environment, service and agent, with a default of 1.0
 * for any category the LLM left out.
 *
 * @returns `{ audits, necessity }`, with audits in sparse-index order.
 */
export function parseDeepEvalResponse(responseText, sparseIndex) {
    const parsed = parseJsonFromText(responseText);
    const llmAudits = parsed ? parseAudits(parsed.audits, sparseIndex) : [];
    const llmNecessity = parsed ? parseNecessity(parsed.necessity) : [];
    // One audit per interaction: the LLM's where it scored it, a default otherwise.
    const scoredById = new Map();
    for (const audit of llmAudits) {
        scoredById.set(audit.id, audit);
    }
    const audits = sparseIndex.interactions.map((interaction) => scoredById.get(interaction.id) ?? {
        id: interaction.id,
        categories: interaction.categories,
        success: interaction.hasError ? 0.3 : DEFAULT_AUDIT_SCORES.success,
        speed: DEFAULT_AUDIT_SCORES.speed,
        weight: DEFAULT_AUDIT_SCORES.weight,
        contextRelevance: DEFAULT_AUDIT_SCORES.contextRelevance,
        rationale: "default",
    });
    // One necessity judgment per category, in fixed order.
    const judgedByCategory = new Map();
    for (const judgment of llmNecessity) {
        judgedByCategory.set(judgment.category, judgment);
    }
    const necessity = [];
    for (const category of ["environment", "service", "agent"]) {
        necessity.push(judgedByCategory.get(category) ?? {
            category,
            score: 1.0,
            unnecessaryIds: [],
            rationale: "default",
        });
    }
    return { audits, necessity };
}
|
|
296
|
+
/**
 * Validate the `audits` array from the judge's JSON.
 *
 * Items are dropped unless they are objects with a numeric `id` that matches
 * an interaction in the sparse index. Scores are clamped to [0, 1];
 * categories come from the matched interaction, not from the LLM; speed is a
 * placeholder that the caller overrides with the heuristic value.
 *
 * @returns The accepted audits, in the order the LLM listed them.
 */
function parseAudits(raw, sparseIndex) {
    if (!Array.isArray(raw)) {
        return [];
    }
    const knownById = new Map();
    for (const interaction of sparseIndex.interactions) {
        knownById.set(interaction.id, interaction);
    }
    const accepted = [];
    raw.forEach((candidate) => {
        if (candidate === null || typeof candidate !== "object") {
            return;
        }
        const { id, success, weight, contextRelevance, rationale } = candidate;
        if (typeof id !== "number") {
            return;
        }
        const interaction = knownById.get(id);
        if (!interaction) {
            return;
        }
        accepted.push({
            id,
            categories: interaction.categories,
            success: clamp01(success),
            speed: DEFAULT_AUDIT_SCORES.speed, // placeholder — overridden by heuristic
            weight: clamp01(weight),
            contextRelevance: clamp01(contextRelevance),
            rationale: typeof rationale === "string" ? rationale : "",
        });
    });
    return accepted;
}
|
|
322
|
+
/**
 * Validate the `necessity` array from the judge's JSON.
 *
 * Keeps only objects whose `category` is "environment", "service" or "agent".
 * The score is clamped to [0, 1], non-numeric entries are removed from
 * `unnecessaryIds`, and a non-string rationale becomes "".
 *
 * @returns The accepted judgments, in the order the LLM listed them.
 */
function parseNecessity(raw) {
    if (!Array.isArray(raw)) {
        return [];
    }
    const allowed = ["environment", "service", "agent"];
    return raw
        .filter((candidate) => candidate !== null && typeof candidate === "object")
        .filter((candidate) => typeof candidate.category === "string" && allowed.includes(candidate.category))
        .map((candidate) => {
            const ids = candidate.unnecessaryIds;
            return {
                category: candidate.category,
                score: clamp01(candidate.score),
                unnecessaryIds: Array.isArray(ids) ? ids.filter((id) => typeof id === "number") : [],
                rationale: typeof candidate.rationale === "string" ? candidate.rationale : "",
            };
        });
}
|
|
345
|
+
/**
 * Build a result without consulting the LLM.
 *
 * Every interaction gets the default audit scores (success 0.3 when it has an
 * error), and each of the three categories gets a necessity score of 0.8.
 *
 * @returns `{ audits, necessity }`.
 */
function buildDefaultResult(sparseIndex) {
    const audits = [];
    for (const interaction of sparseIndex.interactions) {
        audits.push({
            id: interaction.id,
            categories: interaction.categories,
            success: interaction.hasError ? 0.3 : DEFAULT_AUDIT_SCORES.success,
            speed: DEFAULT_AUDIT_SCORES.speed,
            weight: DEFAULT_AUDIT_SCORES.weight,
            contextRelevance: DEFAULT_AUDIT_SCORES.contextRelevance,
            rationale: "default",
        });
    }
    const necessity = ["environment", "service", "agent"].map((category) => ({
        category,
        score: 0.8,
        unnecessaryIds: [],
        rationale: "default",
    }));
    return { audits, necessity };
}
|
|
364
|
+
/**
 * Join sparse-index lines into one string, stopping once a character budget
 * would be exceeded.
 *
 * Lines are added in order while the sum of their lengths stays within
 * `maxChars` (newline separators are not counted). The first line that would
 * go over budget is replaced by a note stating how many lines were left out,
 * and nothing after it is included.
 *
 * @param lines - Sparse index lines, in order.
 * @param maxChars - Character budget; defaults to MAX_SPARSE_INDEX_CHARS.
 * @returns The included lines (plus the omission note, if any) joined by "\n".
 */
function truncateSparseLines(lines, maxChars = MAX_SPARSE_INDEX_CHARS) {
    const kept = [];
    let usedChars = 0;
    for (const line of lines) {
        if (usedChars + line.length > maxChars) {
            // `kept` holds only real lines at this point, so the difference is
            // exactly the number of lines not shown.
            kept.push(`... (${lines.length - kept.length} more interactions omitted)`);
            break;
        }
        kept.push(line);
        usedChars += line.length;
    }
    return kept.join("\n");
}
|
|
377
|
+
/**
 * Coerce an LLM-supplied score into the range [0, 1].
 *
 * Anything that is not a finite number (missing, string, NaN, ±Infinity)
 * becomes the neutral score 0.5; finite numbers are clamped to the range.
 */
function clamp01(value) {
    const isUsable = typeof value === "number" && Number.isFinite(value);
    return isUsable ? Math.min(1, Math.max(0, value)) : 0.5;
}
|
|
382
|
+
//# sourceMappingURL=deep-eval.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"deep-eval.js","sourceRoot":"","sources":["../../src/scoring/deep-eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAYrD,OAAO,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAC3D,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEpD,iEAAiE;AACjE,MAAM,2BAA2B,GAAG,KAAK,CAAC;AAE1C,yDAAyD;AACzD,MAAM,iBAAiB,GAAG,MAAM,CAAC;AAEjC,oEAAoE;AACpE,MAAM,sBAAsB,GAAG,MAAM,CAAC;AAEtC;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,MAAiB,EACjB,WAAwB,EACxB,MAAoB,EACpB,UAAgC;IAEhC,uDAAuD;IACvD,IAAI,WAAW,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1C,OAAO,kBAAkB,CAAC,WAAW,CAAC,CAAC;IACzC,CAAC;IAED,+CAA+C;IAC/C,MAAM,MAAM,GAAG,mBAAmB,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;IAC5E,MAAM,YAAY,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACrD,MAAM,UAAU,GAAG,qBAAqB,CAAC,YAAY,EAAE,WAAW,CAAC,CAAC;IAEpE,yEAAyE;IACzE,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,WAAW,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,KAAK,CAAC,EAAE,CAAC,CAAC;QAC5E,IAAI,WAAW,EAAE,CAAC;YAChB,KAAK,CAAC,KAAK,GAAG,qBAAqB,CAAC,WAAW,CAAC,CAAC;QACnD,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,qBAAqB,CAAC,WAAwB;IAC5D,MAAM,EAAE,UAAU,EAAE,UAAU,EAAE,GAAG,WAAW,CAAC;IAE/C,oCAAoC;IACpC,IAAI,UAAU,KAAK,IAAI,IAAI,UAAU,IAAI,CAAC;QAAE,OAAO,GAAG,CAAC;IAEvD,MAAM,OAAO,GAAG,UAAU,GAAG,IAAI,CAAC;IAElC,0EAA0E;IAC1E,IAAI,UAAU,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;QACnC,IAAI,OAAO,IAAI,CAAC;YAAE,OAAO,GAAG,CAAC;QAC7B,IAAI,OAAO,IAAI,CAAC;YAAE,OAAO,GAAG,CAAC;QAC7B,IAAI,OAAO,IAAI,EAAE;YAAE,OAAO,GAAG,CAAC;QAC9B,IAAI,OAAO,IAAI,EAAE;YAAE,OAAO,GAAG,CAAC;QAC9B,OAAO,GAAG,CAAC;IACb,CAAC;IAED,qFAAqF;IACrF,IAAI,UAAU,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QACvC,IAAI,OAAO,IAAI,GAAG;YAAE,OAAO,GAAG,CAAC;QAC/B,IAAI,OAAO,IAAI,CAAC;YAAE,OAAO,GAAG,CAAC;QAC7B,IAAI,OAAO,IAAI,CAAC;YAAE,OAAO,GAAG,CAAC;QAC7B,IAAI,OAAO,IAAI,EAAE;YAAE,OAAO,GAAG,CAAC;QAC9B,OAAO,GAAG,CAAC;
IACb,CAAC;IAED,oCAAoC;IACpC,IAAI,OAAO,IAAI,CAAC;QAAE,OAAO,GAAG,CAAC;IAC7B,IAAI,OAAO,IAAI,CAAC;QAAE,OAAO,GAAG,CAAC;IAC7B,IAAI,OAAO,IAAI,EAAE;QAAE,OAAO,GAAG,CAAC;IAC9B,IAAI,OAAO,IAAI,EAAE;QAAE,OAAO,GAAG,CAAC;IAC9B,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,mBAAmB,CAC1B,MAAiB,EACjB,WAAwB,EACxB,MAAoB,EACpB,UAAgC;IAEhC,MAAM,EAAE,KAAK,EAAE,GAAG,WAAW,CAAC;IAE9B,uCAAuC;IACvC,MAAM,WAAW,GAAG,mBAAmB,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;IAE3D,4DAA4D;IAC5D,MAAM,kBAAkB,GAAG,uBAAuB,CAAC,WAAW,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;IAEpF,wCAAwC;IACxC,IAAI,aAAa,GAAG,EAAE,CAAC;IACvB,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACrF,MAAM,YAAY,GAAG,MAAM,CAAC,QAAQ;aACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,WAAW,eAAe,CAAC,CAAC,QAAQ,oBAAoB,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC;aAC1G,IAAI,CAAC,IAAI,CAAC,CAAC;QAEd,MAAM,UAAU,GAA0B,CAAC,aAAa,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;QAC9E,MAAM,iBAAiB,GAAG,UAAU;aACjC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;aACtC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,KAAK,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC;aAC9C,IAAI,CAAC,IAAI,CAAC,CAAC;QAEd,aAAa,GAAG;;EAElB,iBAAiB,IAAI,QAAQ;EAC7B,YAAY,CAAC,CAAC,CAAC,gBAAgB,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE;CACnD,CAAC;IACA,CAAC;IAED,OAAO;;;;YAIG,MAAM,CAAC,YAAY;;;EAG7B,MAAM,CAAC,MAAM;EACb,aAAa;yBACU,KAAK,CAAC,iBAAiB;EAC9C,WAAW;;;8BAGiB,KAAK,CAAC,UAAU,CAAC,WAAW;0BAChC,KAAK,CAAC,UAAU,CAAC,OAAO;wBAC1B,KAAK,CAAC,UAAU,CAAC,KAAK;YAClC,KAAK,CAAC,WAAW;oBACT,KAAK,CAAC,eAAe;;;EAGvC,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qDA+BiC,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,SAAS,uBAAuB,CAC9B,WAAwB,EACxB,MAAoB,EACpB,UAAgC;IAEhC,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAE1E,KAAK,IAAI,GAAG,GAAG,CAAC,EAAE,GAAG,GAAG,WAAW,CAAC,YAAY,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC;QAC/D,MAAM,WA
AW,GAAG,WAAW,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QAElD,iDAAiD;QACjD,MAAM,WAAW,GAAG,WAAW,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3G,MAAM,gBAAgB,GACpB,WAAW,CAAC,MAAM,GAAG,2BAA2B;YAC9C,CAAC,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,2BAA2B,CAAC,GAAG,mBAAmB;YACzE,CAAC,CAAC,WAAW,CAAC;QAElB,IAAI,UAAU,GAAG,gBAAgB,CAAC,MAAM,GAAG,iBAAiB,EAAE,CAAC;YAC7D,QAAQ,CAAC,IAAI,CACX,oBAAoB,WAAW,CAAC,YAAY,CAAC,MAAM,GAAG,GAAG,iDAAiD,CAC3G,CAAC;YACF,MAAM;QACR,CAAC;QAED,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,cAAc,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAE3D,QAAQ,CAAC,IAAI,CAAC;GACf,WAAW,CAAC,EAAE,gBAAgB,WAAW,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,UAAU;EAC7E,gBAAgB;IACd,CAAC,CAAC;QAEF,UAAU,IAAI,gBAAgB,CAAC,MAAM,CAAC;IACxC,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED,SAAS,eAAe,CAAC,KAAsB;IAC7C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;QACnB,KAAK,WAAW;YACd,KAAK,CAAC,IAAI,CAAC,eAAe,KAAK,CAAC,IAAI,IAAI,WAAW,EAAE,CAAC,CAAC;YACvD,MAAM;QACR,KAAK,UAAU;YACb,KAAK,CAAC,IAAI,CAAC,cAAc,KAAK,CAAC,QAAQ,IAAI,SAAS,EAAE,CAAC,CAAC;YACxD,IAAI,KAAK,CAAC,gBAAgB;gBAAE,KAAK,CAAC,IAAI,CAAC,YAAY,KAAK,CAAC,gBAAgB,EAAE,CAAC,CAAC;YAC7E,IAAI,KAAK,CAAC,SAAS,EAAE,CAAC;gBACpB,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBACjD,KAAK,CAAC,IAAI,CAAC,iBAAiB,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;YACrG,CAAC;YACD,MAAM;QACR,KAAK,aAAa;YAChB,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;YAC5B,IAAI,KAAK,CAAC,cAAc,EAAE,CAAC;gBACzB,MAAM,MAAM,GACV,KAAK,CAAC,cAAc,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC;gBAC1G,KAAK,CAAC,IAAI,CAAC,aAAa,MAAM,EAAE,CAAC,CAAC;YACpC,CAAC;YACD,MAAM;QACR,KAAK,OAAO;YACV,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,CAAC,YAAY,IAAI,KAAK,CAAC,IAAI,IAAI,iBAAiB,EAAE,CAAC,C
AAC;YAC/E,MAAM;QACR;YACE,KAAK,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,SAAoB,EAAE,MAAc;IAC3D,MAAM,OAAO,GAAG,UAAU,CAAC,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE1D,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,iBAAiB,CAAC,CAAC,CAAC;IAC5E,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC/B,MAAM;YACN,MAAM,EAAE,SAAS,CAAC,WAAW;YAC7B,QAAQ,EAAE;gBACR,GAAG,EAAE,eAAe;gBACpB,IAAI,EAAE,sBAAsB;gBAC5B,MAAM;gBACN,MAAM,EAAE,EAAE;aACX;YACD,gBAAgB,EAAE,SAAS;SAC5B,CAAC,CAAC;QACH,OAAO,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;IAC7B,CAAC;YAAS,CAAC;QACT,IAAI,CAAC;YACH,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,CAAC;QAAC,MAAM,CAAC;YACP,YAAY;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,2BAA2B;AAE3B;;;GAGG;AACH,MAAM,UAAU,qBAAqB,CAAC,YAAoB,EAAE,WAAwB;IAClF,MAAM,MAAM,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAC;IAE/C,IAAI,SAAS,GAAuB,EAAE,CAAC;IACvC,IAAI,YAAY,GAAwB,EAAE,CAAC;IAE3C,IAAI,MAAM,EAAE,CAAC;QACX,SAAS,GAAG,WAAW,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QACpD,YAAY,GAAG,cAAc,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;IAClD,CAAC;IAED,yFAAyF;IACzF,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1D,MAAM,SAAS,GAAuB,EAAE,CAAC;IAEzC,KAAK,MAAM,WAAW,IAAI,WAAW,CAAC,YAAY,EAAE,CAAC;QACnD,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;QAC9C,IAAI,QAAQ,EAAE,CAAC;YACb,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC3B,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,IAAI,CAAC;gBACb,EAAE,EAAE,WAAW,CAAC,EAAE;gBAClB,UAAU,EAAE,WAAW,CAAC,UAAU;gBAClC,OAAO,EAAE,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,oBAAoB,CAAC,OAAO;gBAClE,KAAK,EAAE,oBAAoB,CAAC,KAAK;gBACjC,MAAM,EAAE,oBAAoB,CAAC,MAAM;gBACnC,gBAAgB,EAAE,oBAAoB,CAAC,gBAAgB;gBACvD,SAAS,EAAE,SAAS;aACrB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,uDAAuD;IACvD,MAAM,UAAU,GAA0B,CAAC,aAAa,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAC9
E,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IACvE,MAAM,YAAY,GAAwB,UAAU,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QAC/D,MAAM,QAAQ,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACvC,IAAI,QAAQ;YAAE,OAAO,QAAQ,CAAC;QAC9B,OAAO;YACL,QAAQ,EAAE,GAAG;YACb,KAAK,EAAE,GAAG;YACV,cAAc,EAAE,EAAE;YAClB,SAAS,EAAE,SAAS;SACrB,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;AACxD,CAAC;AAED,SAAS,WAAW,CAAC,GAAY,EAAE,WAAwB;IACzD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/E,MAAM,MAAM,GAAuB,EAAE,CAAC;IAEtC,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;QACvB,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ;YAAE,SAAS;QAChD,MAAM,GAAG,GAAG,IAA+B,CAAC;QAE5C,IAAI,OAAO,GAAG,CAAC,EAAE,KAAK,QAAQ;YAAE,SAAS;QAEzC,MAAM,WAAW,GAAG,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAC/C,IAAI,CAAC,WAAW;YAAE,SAAS;QAE3B,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,GAAG,CAAC,EAAE;YACV,UAAU,EAAE,WAAW,CAAC,UAAU;YAClC,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;YAC7B,KAAK,EAAE,oBAAoB,CAAC,KAAK,EAAE,wCAAwC;YAC3E,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC;YAC3B,gBAAgB,EAAE,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;YAC/C,SAAS,EAAE,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;SAClE,CAAC,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,cAAc,CAAC,GAAY;IAClC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,MAAM,eAAe,GAAG,IAAI,GAAG,CAAsB,CAAC,aAAa,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC1F,MAAM,SAAS,GAAwB,EAAE,CAAC;IAE1C,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;QACvB,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ;YAAE,SAAS;QAChD,MAAM,GAAG,GAAG,IAA+B,CAAC;QAE5C,IAAI,OAAO,GAAG,CAAC,QAAQ,KAAK,QAAQ,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,QAA+B,CAAC;YAAE,SAAS;QAE5G,MAAM,cAAc,GAAG,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;YACtD,CAAC,CAAC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC
,EAAE,EAAgB,EAAE,CAAC,OAAO,EAAE,KAAK,QAAQ,CAAC;YACzE,CAAC,CAAC,EAAE,CAAC;QAEP,SAAS,CAAC,IAAI,CAAC;YACb,QAAQ,EAAE,GAAG,CAAC,QAA+B;YAC7C,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC;YACzB,cAAc;YACd,SAAS,EAAE,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;SAClE,CAAC,CAAC;IACL,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,kBAAkB,CAAC,WAAwB;IAClD,MAAM,MAAM,GAAuB,WAAW,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;QAChF,EAAE,EAAE,WAAW,CAAC,EAAE;QAClB,UAAU,EAAE,WAAW,CAAC,UAAU;QAClC,OAAO,EAAE,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,oBAAoB,CAAC,OAAO;QAClE,KAAK,EAAE,oBAAoB,CAAC,KAAK;QACjC,MAAM,EAAE,oBAAoB,CAAC,MAAM;QACnC,gBAAgB,EAAE,oBAAoB,CAAC,gBAAgB;QACvD,SAAS,EAAE,SAAS;KACrB,CAAC,CAAC,CAAC;IAEJ,MAAM,UAAU,GAA0B,CAAC,aAAa,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAC9E,MAAM,SAAS,GAAwB,UAAU,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QAC9D,QAAQ,EAAE,GAAG;QACb,KAAK,EAAE,GAAG;QACV,cAAc,EAAE,EAAE;QAClB,SAAS,EAAE,SAAS;KACrB,CAAC,CAAC,CAAC;IAEJ,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;AAC/B,CAAC;AAED,SAAS,mBAAmB,CAAC,KAAe;IAC1C,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,UAAU,GAAG,IAAI,CAAC,MAAM,GAAG,sBAAsB,EAAE,CAAC;YACtD,QAAQ,CAAC,IAAI,CAAC,QAAQ,KAAK,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,6BAA6B,CAAC,CAAC;YACnF,MAAM;QACR,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpB,UAAU,IAAI,IAAI,CAAC,MAAM,CAAC;IAC5B,CAAC;IACD,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC7B,CAAC;AAED,SAAS,OAAO,CAAC,KAAc;IAC7B,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;QAAE,OAAO,GAAG,CAAC;IACrE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { NormalizedEntry } from "../transcript/types.js";
|
|
2
|
+
import type { RunResult } from "../types/output.js";
|
|
3
|
+
import type { GoalAchievementScore } from "../types/scoring.js";
|
|
4
|
+
export declare function scoreGoalAchievement(result: RunResult, normalizedEntries: NormalizedEntry[]): Promise<GoalAchievementScore>;
|
|
5
|
+
//# sourceMappingURL=goal-achievement.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"goal-achievement.d.ts","sourceRoot":"","sources":["../../src/scoring/goal-achievement.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAE9D,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,OAAO,KAAK,EAAE,oBAAoB,EAAkB,MAAM,qBAAqB,CAAC;AAGhF,wBAAsB,oBAAoB,CACxC,MAAM,EAAE,SAAS,EACjB,iBAAiB,EAAE,eAAe,EAAE,GACnC,OAAO,CAAC,oBAAoB,CAAC,CAa/B"}
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as os from "node:os";
|
|
3
|
+
import * as path from "node:path";
|
|
4
|
+
import { getAdapter } from "../adapters/registry.js";
|
|
5
|
+
import { parseJsonFromText } from "./parse-json.js";
|
|
6
|
+
/**
 * Score how well a run satisfied its rubric, using an LLM judge.
 *
 * A string rubric is graded as one criterion; an array rubric is graded
 * criterion by criterion. A missing or empty array rubric returns a zero
 * score with no criteria and never invokes the judge.
 *
 * @param result - Run result; `rubric` and `output.result` are read here.
 * @param normalizedEntries - Normalized transcript entries forwarded to the prompt builders.
 * @returns Promise resolving to `{ score, criteria }`.
 */
export async function scoreGoalAchievement(result, normalizedEntries) {
    const rubric = result.rubric;
    const finalResult = result.output.result;
    if (typeof rubric === "string") {
        return scoreStringRubric(result, rubric, normalizedEntries, finalResult);
    }
    const hasCriteria = Boolean(rubric) && rubric.length !== 0;
    return hasCriteria
        ? scoreArrayRubric(result, rubric, normalizedEntries, finalResult)
        : { score: 0, criteria: [] };
}
|
|
17
|
+
/**
 * Grade a run against a free-text rubric with a single judge call.
 *
 * The judge's 0-10 score is rounded and clamped to [0, 10] for the single
 * criterion, and rescaled to 0-100 for the overall score. If the reply cannot
 * be parsed, or carries no numeric `score`, both scores are 0.
 *
 * @param runResult - Run result passed to the prompt builder and the judge.
 * @param rubric - Rubric text; also reported as the criterion's `check`.
 * @param entries - Normalized transcript entries.
 * @param finalResult - The agent's final result text, if any.
 * @returns Promise resolving to `{ score, criteria }` with exactly one criterion.
 */
async function scoreStringRubric(runResult, rubric, entries, finalResult) {
    const judgePrompt = buildStringRubricPrompt(runResult, entries, finalResult, rubric);
    const verdict = parseJsonFromText(await callJudge(runResult, judgePrompt));
    // Both outcomes share one shape: an overall score plus a single criterion.
    const toScore = (overall, criterionScore, rationale) => ({
        score: overall,
        criteria: [{ check: rubric, weight: 1.0, score: criterionScore, rationale }],
    });
    if (!verdict || typeof verdict.score !== "number") {
        return toScore(0, 0, "Failed to parse judge response");
    }
    const clamped = Math.max(0, Math.min(10, Math.round(verdict.score)));
    return toScore(Math.round((clamped / 10) * 100), clamped, verdict.rationale ?? "");
}
|
|
47
|
+
/**
 * Grade a run against a list of weighted criteria with a single judge call.
 *
 * @param runResult - Run result passed to the prompt builder and the judge.
 * @param rubric - Criteria, each read for `check` and `weight`.
 * @param entries - Normalized transcript entries.
 * @param finalResult - The agent's final result text, if any.
 * @returns Promise resolving to `{ score, criteria }`, where `score` is the weighted 0-100 total.
 */
async function scoreArrayRubric(runResult, rubric, entries, finalResult) {
    const judgeReply = await callJudge(
        runResult,
        buildArrayRubricPrompt(runResult, entries, finalResult, rubric),
    );
    const criteria = parseArrayJudgeResponse(judgeReply, rubric);
    return { score: computeWeightedScore(criteria), criteria };
}
|
|
54
|
+
/**
 * Run the judge prompt through the adapter named in the run's own agent
 * config, inside a throwaway temp directory, and return its result text.
 *
 * @param runResult - Run result; `agentConfig` selects and configures the adapter.
 * @param prompt - Full judge prompt.
 * @returns Promise resolving to the adapter output's `result`, or "" when it is null/undefined.
 */
async function callJudge(runResult, prompt) {
    const { agentConfig } = runResult;
    const adapter = getAdapter(agentConfig.adapter);
    const workspace = fs.mkdtempSync(path.join(os.tmpdir(), "axis-judge-"));
    const judgeScenario = {
        key: "__judge__",
        name: "AXIS Judge",
        prompt,
        rubric: [],
    };
    try {
        const { result } = await adapter.run({
            prompt,
            config: agentConfig,
            scenario: judgeScenario,
            workingDirectory: workspace,
        });
        return result ?? "";
    }
    finally {
        // Best-effort cleanup: a failure to remove the temp dir must not mask the judge outcome.
        try {
            fs.rmSync(workspace, { recursive: true, force: true });
        }
        catch {
            /* ignore */
        }
    }
}
|
|
80
|
+
/** Character budget for the whole condensed transcript section. */
const MAX_TRANSCRIPT_CHARS = 50000;
/** Character budget for any one transcript entry. */
const MAX_ENTRY_CHARS = 2000;
|
|
84
|
+
/**
 * Build the judge prompt for a free-text rubric.
 *
 * The prompt contains the scenario name, the original task, the condensed
 * transcript, the agent's final result and the rubric, then asks for a single
 * JSON object with a 0-10 `score` and a `rationale`.
 *
 * @param result - Run result; `scenarioName` and the original prompt are read.
 * @param entries - Normalized transcript entries to condense.
 * @param finalResult - The agent's final result; a placeholder is used when null/undefined.
 * @param rubric - Rubric text.
 * @returns The complete prompt string.
 */
function buildStringRubricPrompt(result, entries, finalResult, rubric) {
    const scenarioName = result.scenarioName;
    const task = getOriginalPrompt(result);
    const transcript = formatTranscriptForJudge(entries);
    const outcome = finalResult ?? "(no final result)";
    return `You are an expert evaluator for an AI agent testing framework called AXIS.

An AI agent was given a task. You must evaluate how well it performed by reviewing its transcript AND by independently verifying the results yourself.

SCENARIO: ${scenarioName}

TASK GIVEN TO AGENT:
${task}

---

AGENT TRANSCRIPT (condensed):
${transcript}

---

AGENT'S FINAL RESULT:
${outcome}

---

RUBRIC:
${rubric}

---

INSTRUCTIONS:
1. Review the transcript to understand what the agent did.
2. Where possible, independently verify the results — visit URLs, check endpoints, confirm that the claimed outcomes actually exist. Do not trust the transcript alone.
3. Score based on what you can verify, not just what the agent claims.

When done, respond with ONLY valid JSON on its own line:
{"score": <0-10>, "rationale": "<1-2 sentence explanation>"}

Score guide: 0 = not met at all, 5 = partially met, 10 = fully met.`;
}
|
|
121
|
+
/**
 * Build the judge prompt for a list of rubric criteria.
 *
 * Criteria are listed with zero-based indices, and the judge is asked to
 * reply with a `grades` array whose items refer back to those indices through
 * `criterion_index`.
 *
 * @param result - Run result; `scenarioName` and the original prompt are read.
 * @param entries - Normalized transcript entries to condense.
 * @param finalResult - The agent's final result; a placeholder is used when null/undefined.
 * @param rubric - Criteria, each read for `check` and `weight`.
 * @returns The complete prompt string.
 */
function buildArrayRubricPrompt(result, entries, finalResult, rubric) {
    const criterionLines = rubric.map(
        (criterion, position) => `${position}. "${criterion.check}" (weight: ${criterion.weight})`,
    );
    const rubricText = criterionLines.join("\n");
    const scenarioName = result.scenarioName;
    const task = getOriginalPrompt(result);
    const transcript = formatTranscriptForJudge(entries);
    const outcome = finalResult ?? "(no final result)";
    return `You are an expert evaluator for an AI agent testing framework called AXIS.

An AI agent was given a task. You must evaluate how well it performed by reviewing its transcript AND by independently verifying the results yourself.

SCENARIO: ${scenarioName}

TASK GIVEN TO AGENT:
${task}

---

AGENT TRANSCRIPT (condensed):
${transcript}

---

AGENT'S FINAL RESULT:
${outcome}

---

RUBRIC CRITERIA:
${rubricText}

---

INSTRUCTIONS:
1. Review the transcript to understand what the agent did.
2. Where possible, independently verify the results — visit URLs, check endpoints, confirm that the claimed outcomes actually exist. Do not trust the transcript alone.
3. For each criterion, provide a score from 0 to 10 and a brief rationale.

Score guide: 0 = not met at all, 5 = partially met, 10 = fully met.

When done, respond with ONLY valid JSON on its own line:
{"grades": [{"criterion_index": 0, "score": <0-10>, "rationale": "<string>"}, ...]}`;
}
|
|
159
|
+
/**
 * Return the prompt stored on a run result.
 *
 * @param result - Run result carrying a `prompt` property.
 * @returns The value of `result.prompt`.
 */
function getOriginalPrompt(result) {
    const { prompt } = result;
    return prompt;
}
|
|
162
|
+
/**
 * Render normalized transcript entries as newline-separated, numbered lines
 * for the judge prompt.
 *
 * Entries are condensed one at a time until adding the next one would exceed
 * MAX_TRANSCRIPT_CHARS; at that point a note with the number of omitted
 * entries is appended and rendering stops.
 *
 * @param entries - Normalized transcript entries.
 * @returns The condensed transcript, or "(empty transcript)" for no entries.
 */
function formatTranscriptForJudge(entries) {
    if (entries.length === 0) {
        return "(empty transcript)";
    }
    const rendered = [];
    let usedChars = 0;
    for (const [position, entry] of entries.entries()) {
        // Entries are numbered from 1 in the output.
        const line = condenseEntry(entry, position + 1);
        usedChars += line.length;
        if (usedChars > MAX_TRANSCRIPT_CHARS) {
            rendered.push(`\n... (${entries.length - position} more entries truncated for brevity)`);
            break;
        }
        rendered.push(line);
    }
    return rendered.join("\n");
}
|
|
182
|
+
/**
 * Render one normalized entry as a single `[index] KIND: text` line.
 *
 * Assistant, tool-result, error and user text is capped at MAX_ENTRY_CHARS;
 * tool input summaries, system text and unrecognized entry types are capped
 * at 500 characters.
 *
 * @param entry - Normalized entry; `type` selects the format.
 * @param index - Number shown in the leading `[index]` tag.
 * @returns The condensed line.
 */
function condenseEntry(entry, index) {
    const tag = `[${index}]`;
    const { type } = entry;
    if (type === "assistant") {
        return `${tag} ASSISTANT: ${truncate(entry.text ?? "(no text)", MAX_ENTRY_CHARS)}`;
    }
    if (type === "tool_use") {
        const toolName = entry.toolName ?? "unknown";
        const args = entry.toolInputSummary ? `(${truncate(entry.toolInputSummary, 500)})` : "";
        return `${tag} TOOL_USE: ${toolName}${args}`;
    }
    if (type === "tool_result") {
        return `${tag} TOOL_RESULT: ${truncate(entry.toolResultText ?? "(no result)", MAX_ENTRY_CHARS)}`;
    }
    if (type === "error") {
        return `${tag} ERROR: ${truncate(entry.errorMessage ?? entry.text ?? "(unknown error)", MAX_ENTRY_CHARS)}`;
    }
    if (type === "system") {
        return `${tag} SYSTEM: ${truncate(entry.text ?? "(no content)", 500)}`;
    }
    if (type === "user") {
        return `${tag} USER: ${truncate(entry.text ?? "(no content)", MAX_ENTRY_CHARS)}`;
    }
    // Unrecognized types are still shown, labelled with their raw type.
    return `${tag} ${type}: ${truncate(entry.text ?? "(no content)", 500)}`;
}
|
|
206
|
+
/**
 * Shorten `text` to at most `maxLen` UTF-16 code units and append "..." when
 * anything was cut off.
 *
 * If the cut would fall between the two halves of a surrogate pair (for
 * example an emoji), it is moved back by one code unit so the result never
 * ends in a lone high surrogate.
 *
 * @param text - String to shorten.
 * @param maxLen - Maximum number of code units kept from `text`.
 * @returns `text` unchanged when it fits, otherwise the shortened prefix plus "...".
 */
function truncate(text, maxLen) {
    if (text.length <= maxLen) {
        return text;
    }
    let cut = maxLen;
    if (cut > 0) {
        const lastKept = text.charCodeAt(cut - 1);
        const firstDropped = text.charCodeAt(cut);
        const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff
            && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
        if (splitsPair) {
            // Drop the orphaned high surrogate rather than emit half a character.
            cut -= 1;
        }
    }
    return text.slice(0, cut) + "...";
}
|
|
211
|
+
/**
 * Turn the judge's reply into one graded criterion per rubric item.
 *
 * The reply is expected to parse to `{ grades: [...] }`, where each grade
 * refers to a rubric item through `criterion_index`. If the reply cannot be
 * parsed or has no `grades` array, every criterion scores 0. A criterion with
 * no matching grade scores 0 with the rationale "No grade provided".
 *
 * Scores are rounded and clamped to [0, 10]. A grade whose `score` does not
 * round to a number (missing, or non-numeric text) scores 0 instead of NaN,
 * so one malformed grade cannot turn the overall weighted score into NaN.
 *
 * @param responseText - Raw text returned by the judge.
 * @param rubric - Criteria, each read for `check` and `weight`.
 * @returns Array of `{ check, weight, score, rationale }` in rubric order.
 */
function parseArrayJudgeResponse(responseText, rubric) {
    const parsed = parseJsonFromText(responseText);
    if (!parsed || !Array.isArray(parsed.grades)) {
        return rubric.map((r) => ({
            check: r.check,
            weight: r.weight,
            score: 0,
            rationale: "Failed to parse judge response",
        }));
    }
    const grades = parsed.grades;
    return rubric.map((r, i) => {
        // `?.` tolerates null entries in the grades array instead of throwing.
        const grade = grades.find((g) => g?.criterion_index === i);
        const rounded = grade ? Math.round(grade.score) : 0;
        return {
            check: r.check,
            weight: r.weight,
            // Math.min/Math.max propagate NaN, so it has to be screened out first.
            score: Number.isNaN(rounded) ? 0 : Math.max(0, Math.min(10, rounded)),
            rationale: grade?.rationale ?? "No grade provided",
        };
    });
}
|
|
232
|
+
/**
 * Combine per-criterion 0-10 scores into one weighted 0-100 score.
 *
 * @param criteria - Graded criteria, each read for `score` and `weight`.
 * @returns The rounded weighted average as a percentage; 0 when there are no
 *   criteria or the weights sum to 0.
 */
function computeWeightedScore(criteria) {
    let totalWeight = 0;
    let weightedSum = 0;
    for (const { score, weight } of criteria) {
        totalWeight += weight;
        weightedSum += (score / 10) * weight;
    }
    if (criteria.length === 0 || totalWeight === 0) {
        return 0;
    }
    return Math.round((weightedSum / totalWeight) * 100);
}
|
|
241
|
+
//# sourceMappingURL=goal-achievement.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"goal-achievement.js","sourceRoot":"","sources":["../../src/scoring/goal-achievement.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAKrD,OAAO,EAAE,iBAAiB,EAAE,MAAM,iBAAiB,CAAC;AAEpD,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,MAAiB,EACjB,iBAAoC;IAEpC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC;IAC1B,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,CAAC,MAAM,CAAC;IAE9C,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QAC/B,OAAO,iBAAiB,CAAC,MAAM,EAAE,MAAM,EAAE,iBAAiB,EAAE,WAAW,CAAC,CAAC;IAC3E,CAAC;IAED,IAAI,CAAC,MAAM,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACnC,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;IACpC,CAAC;IAED,OAAO,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,iBAAiB,EAAE,WAAW,CAAC,CAAC;AAC1E,CAAC;AAED,KAAK,UAAU,iBAAiB,CAC9B,SAAoB,EACpB,MAAc,EACd,OAA0B,EAC1B,WAA0B;IAE1B,MAAM,MAAM,GAAG,uBAAuB,CAAC,SAAS,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;IAChF,MAAM,YAAY,GAAG,MAAM,SAAS,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAExD,MAAM,MAAM,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAC;IAC/C,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;QAChD,OAAO;YACL,KAAK,EAAE,CAAC;YACR,QAAQ,EAAE;gBACR;oBACE,KAAK,EAAE,MAAM;oBACb,MAAM,EAAE,GAAG;oBACX,KAAK,EAAE,CAAC;oBACR,SAAS,EAAE,gCAAgC;iBAC5C;aACF;SACF,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAClE,OAAO;QACL,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,GAAG,CAAC;QACrC,QAAQ,EAAE;YACR;gBACE,KAAK,EAAE,MAAM;gBACb,MAAM,EAAE,GAAG;gBACX,KAAK;gBACL,SAAS,EAAG,MAAM,CAAC,SAAoB,IAAI,EAAE;aAC9C;SACF;KACF,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,gBAAgB,CAC7B,SAAoB,EACpB,MAAyB,EACzB,OAA0B,EAC1B,WAA0B;IAE1B,MAAM,MAAM,GAAG,sBAAsB,CAAC,SAAS,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;IAC/E,MAAM,YAAY,GAAG,MAAM,SAAS,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAExD,MAAM,QAAQ,GAAG,uBAAuB,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAC/D,MAAM,KAAK,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAE7
C,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;AAC7B,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,SAAoB,EAAE,MAAc;IAC3D,MAAM,OAAO,GAAG,UAAU,CAAC,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE1D,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,aAAa,CAAC,CAAC,CAAC;IACxE,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC/B,MAAM;YACN,MAAM,EAAE,SAAS,CAAC,WAAW;YAC7B,QAAQ,EAAE;gBACR,GAAG,EAAE,WAAW;gBAChB,IAAI,EAAE,YAAY;gBAClB,MAAM;gBACN,MAAM,EAAE,EAAE;aACX;YACD,gBAAgB,EAAE,SAAS;SAC5B,CAAC,CAAC;QACH,OAAO,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;IAC7B,CAAC;YAAS,CAAC;QACT,IAAI,CAAC;YACH,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,CAAC;QAAC,MAAM,CAAC;YACP,YAAY;QACd,CAAC;IACH,CAAC;AACH,CAAC;AAED,2DAA2D;AAC3D,MAAM,oBAAoB,GAAG,MAAM,CAAC;AAEpC,sDAAsD;AACtD,MAAM,eAAe,GAAG,KAAK,CAAC;AAE9B,SAAS,uBAAuB,CAC9B,MAAiB,EACjB,OAA0B,EAC1B,WAA0B,EAC1B,MAAc;IAEd,OAAO;;;;YAIG,MAAM,CAAC,YAAY;;;EAG7B,iBAAiB,CAAC,MAAM,CAAC;;;;;EAKzB,wBAAwB,CAAC,OAAO,CAAC;;;;;EAKjC,WAAW,IAAI,mBAAmB;;;;;EAKlC,MAAM;;;;;;;;;;;;oEAY4D,CAAC;AACrE,CAAC;AAED,SAAS,sBAAsB,CAC7B,MAAiB,EACjB,OAA0B,EAC1B,WAA0B,EAC1B,MAAyB;IAEzB,MAAM,UAAU,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,KAAK,cAAc,CAAC,CAAC,MAAO,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEhG,OAAO;;;;YAIG,MAAM,CAAC,YAAY;;;EAG7B,iBAAiB,CAAC,MAAM,CAAC;;;;;EAKzB,wBAAwB,CAAC,OAAO,CAAC;;;;;EAKjC,WAAW,IAAI,mBAAmB;;;;;EAKlC,UAAU;;;;;;;;;;;;oFAYwE,CAAC;AACrF,CAAC;AAED,SAAS,iBAAiB,CAAC,MAAiB;IAC1C,OAAO,MAAM,CAAC,MAAM,CAAC;AACvB,CAAC;AAED;;GAEG;AACH,SAAS,wBAAwB,CAAC,OAA0B;IAC1D,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,oBAAoB,CAAC;IAEtD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAEnD,IAAI,UAAU,GAAG,SAAS,CAAC,MAAM,GAAG,oBAAoB,EAAE,CAAC;YACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;YACrC,KAAK,CAAC,IAAI,CAAC,UAAU,SAAS,s
CAAsC,CAAC,CAAC;YACtE,MAAM;QACR,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACtB,UAAU,IAAI,SAAS,CAAC,MAAM,CAAC;IACjC,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,KAAsB,EAAE,KAAa;IAC1D,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;QACnB,KAAK,WAAW;YACd,OAAO,IAAI,KAAK,gBAAgB,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,WAAW,EAAE,eAAe,CAAC,EAAE,CAAC;QACzF,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,IAAI,GAAG,KAAK,CAAC,QAAQ,IAAI,SAAS,CAAC;YACzC,MAAM,KAAK,GAAG,KAAK,CAAC,gBAAgB,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,KAAK,CAAC,gBAAgB,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YACzF,OAAO,IAAI,KAAK,eAAe,IAAI,GAAG,KAAK,EAAE,CAAC;QAChD,CAAC;QACD,KAAK,aAAa;YAChB,OAAO,IAAI,KAAK,kBAAkB,QAAQ,CAAC,KAAK,CAAC,cAAc,IAAI,aAAa,EAAE,eAAe,CAAC,EAAE,CAAC;QACvG,KAAK,OAAO;YACV,OAAO,IAAI,KAAK,YAAY,QAAQ,CAAC,KAAK,CAAC,YAAY,IAAI,KAAK,CAAC,IAAI,IAAI,iBAAiB,EAAE,eAAe,CAAC,EAAE,CAAC;QACjH,KAAK,QAAQ;YACX,OAAO,IAAI,KAAK,aAAa,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,GAAG,CAAC,EAAE,CAAC;QAC7E,KAAK,MAAM;YACT,OAAO,IAAI,KAAK,WAAW,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,eAAe,CAAC,EAAE,CAAC;QACvF;YACE,OAAO,IAAI,KAAK,KAAK,KAAK,CAAC,IAAI,KAAK,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,cAAc,EAAE,GAAG,CAAC,EAAE,CAAC;IACtF,CAAC;AACH,CAAC;AAED,SAAS,QAAQ,CAAC,IAAY,EAAE,MAAc;IAC5C,IAAI,IAAI,CAAC,MAAM,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC;IACvC,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,GAAG,KAAK,CAAC;AACvC,CAAC;AAED,SAAS,uBAAuB,CAAC,YAAoB,EAAE,MAAyB;IAC9E,MAAM,MAAM,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAC;IAC/C,IAAI,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;QAC7C,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACxB,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,MAAM,EAAE,CAAC,CAAC,MAAO;YACjB,KAAK,EAAE,CAAC;YACR,SAAS,EAAE,gCAAgC;SAC5C,CAAC,CAAC,CAAC;IACN,CAAC;IAED,MAAM,MAAM,GAAG,MAAM,CAAC,MAIpB,CAAC;IAEH,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACzB,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,KAAK,CAAC,CAAC,CAAC;QAC1D,OAAO;YACL,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,MAAM,EAAE,CAAC,CAAC,
MAAO;YACjB,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrE,SAAS,EAAE,KAAK,EAAE,SAAS,IAAI,mBAAmB;SACnD,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,oBAAoB,CAAC,QAA0B;IACtD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEpC,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACnE,IAAI,WAAW,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEhC,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAEpF,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,WAAW,GAAG,WAAW,CAAC,GAAG,GAAG,CAAC,CAAC;AACvD,CAAC"}
|