@tangle-network/agent-eval 0.31.1 → 0.33.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/dist/index.d.ts +246 -3
- package/dist/index.js +318 -61
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +22 -12
package/dist/index.js
CHANGED
|
@@ -2900,6 +2900,11 @@ var MetricsCollector = class {
|
|
|
2900
2900
|
};
|
|
2901
2901
|
|
|
2902
2902
|
// src/driver.ts
|
|
2903
|
+
// Stance paragraphs for the simulated user, keyed by persona rigor level.
// buildDriverSystemPrompt looks the paragraph up as RIGOR_STANCE[rigor] (rigor
// defaults to "demanding" there) and interpolates it into the driver's system prompt.
var RIGOR_STANCE = {
  // Accepts reasonable answers; pushes back only on clear gaps or outright errors.
  cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
  // Rejects vague, hedged, or generic answers and says so.
  demanding: "Your stance: an experienced professional with no time to waste. You do not accept vague, hedged, or generic answers \u2014 you expect specifics, and you say so plainly when you do not get them.",
  // Interrogates every claim and attacks the weakest point of every answer.
  relentless: "Your stance: a senior partner reviewing this work for a client who will litigate if it is wrong. You interrogate every claim. You accept nothing undefended. You find the single weakest point in every answer and attack it. Courteous, never satisfied."
};
|
|
2903
2908
|
var AgentDriver = class {
|
|
2904
2909
|
tc;
|
|
2905
2910
|
client;
|
|
@@ -2929,12 +2934,14 @@ var AgentDriver = class {
|
|
|
2929
2934
|
const conversationHistory = [];
|
|
2930
2935
|
let completed = false;
|
|
2931
2936
|
let turnsToCompletion = null;
|
|
2937
|
+
let criteriaMetAtTurn = null;
|
|
2932
2938
|
for (let turn = 1; turn <= persona.maxTurns; turn++) {
|
|
2933
2939
|
const state = await metrics.getState();
|
|
2934
2940
|
const userMessage = await this.decideNextMessage(persona, state, conversationHistory);
|
|
2935
2941
|
if (userMessage === "DONE") {
|
|
2936
2942
|
completed = true;
|
|
2937
2943
|
turnsToCompletion = turn - 1;
|
|
2944
|
+
console.log(` SIGNED OFF by simulated ${persona.role} after turn ${turn - 1}`);
|
|
2938
2945
|
break;
|
|
2939
2946
|
}
|
|
2940
2947
|
const turnStart = Date.now();
|
|
@@ -2963,11 +2970,9 @@ var AgentDriver = class {
|
|
|
2963
2970
|
console.log(
|
|
2964
2971
|
` [turn ${turn}] ${conv.completionPercent.toFixed(0)}% \u2014 ${criteriaStr} (${(latency / 1e3).toFixed(1)}s)`
|
|
2965
2972
|
);
|
|
2966
|
-
if (conv.complete) {
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
console.log(` COMPLETE at turn ${turn}`);
|
|
2970
|
-
break;
|
|
2973
|
+
if (conv.complete && criteriaMetAtTurn === null) {
|
|
2974
|
+
criteriaMetAtTurn = turn;
|
|
2975
|
+
console.log(` criteria met at turn ${turn} \u2014 driver continues pressure-testing`);
|
|
2971
2976
|
}
|
|
2972
2977
|
}
|
|
2973
2978
|
const finalState = await metrics.getState();
|
|
@@ -2975,6 +2980,7 @@ var AgentDriver = class {
|
|
|
2975
2980
|
personaId: persona.id,
|
|
2976
2981
|
completed,
|
|
2977
2982
|
turnsToCompletion,
|
|
2983
|
+
criteriaMetAtTurn,
|
|
2978
2984
|
totalTurns: turnMetrics.length,
|
|
2979
2985
|
metrics: turnMetrics,
|
|
2980
2986
|
finalState,
|
|
@@ -2985,51 +2991,13 @@ var AgentDriver = class {
|
|
|
2985
2991
|
}
|
|
2986
2992
|
/** Use the driver LLM to decide what the "user" says next */
|
|
2987
2993
|
async decideNextMessage(persona, state, history) {
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
role: "system",
|
|
2995
|
-
content: `You are playing the role of a ${persona.role} testing an AI agent.
|
|
2996
|
-
Your goal: ${persona.goal}
|
|
2997
|
-
|
|
2998
|
-
${this.productContext ? `Product context:
|
|
2999
|
-
${this.productContext}
|
|
3000
|
-
` : ""}
|
|
3001
|
-
Current state:
|
|
3002
|
-
- Tasks: ${state.tasks}
|
|
3003
|
-
- Events: ${state.events}
|
|
3004
|
-
- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
|
|
3005
|
-
- Vault files: ${state.vaultFiles.length} (${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? "..." : ""})
|
|
3006
|
-
|
|
3007
|
-
Completion criteria met: ${this.describeCompletion(persona, state)}
|
|
3008
|
-
|
|
3009
|
-
Decide what to do next:
|
|
3010
|
-
1. If completion is 100% \u2014 respond with exactly "DONE"
|
|
3011
|
-
2. If a proposal is pending \u2014 approve or reject it (with reason)
|
|
3012
|
-
3. If the agent is on track \u2014 push for the next deliverable
|
|
3013
|
-
4. If the agent is off track \u2014 give specific corrective feedback
|
|
3014
|
-
5. If this is the first message \u2014 start with a clear, actionable request
|
|
3015
|
-
|
|
3016
|
-
Output ONLY your next message to the agent. Be specific. Be realistic.
|
|
3017
|
-
Don't be patient \u2014 a real ${persona.role} wouldn't accept vague answers.`
|
|
3018
|
-
},
|
|
3019
|
-
{
|
|
3020
|
-
role: "user",
|
|
3021
|
-
content: recentHistory ? `Recent conversation:
|
|
3022
|
-
${recentHistory}
|
|
3023
|
-
|
|
3024
|
-
The agent just said:
|
|
3025
|
-
${lastResponse}` : "No conversation yet. Send your opening message."
|
|
3026
|
-
}
|
|
3027
|
-
],
|
|
3028
|
-
temperature: 0.5,
|
|
3029
|
-
maxTokens: 500
|
|
2994
|
+
return decideNextUserTurn(this.tc, {
|
|
2995
|
+
persona,
|
|
2996
|
+
state,
|
|
2997
|
+
history,
|
|
2998
|
+
productContext: this.productContext,
|
|
2999
|
+
model: this.driverModel
|
|
3030
3000
|
});
|
|
3031
|
-
const content = resp.choices?.[0]?.message?.content ?? "";
|
|
3032
|
-
return content.trim();
|
|
3033
3001
|
}
|
|
3034
3002
|
/** Handle pending approvals based on persona feedback patterns */
|
|
3035
3003
|
async handleApprovals(persona, workspaceId, _state) {
|
|
@@ -3049,16 +3017,77 @@ ${lastResponse}` : "No conversation yet. Send your opening message."
|
|
|
3049
3017
|
}
|
|
3050
3018
|
}
|
|
3051
3019
|
}
|
|
3052
|
-
/** Describe which completion criteria are met */
|
|
3053
|
-
describeCompletion(persona, state) {
|
|
3054
|
-
const results = persona.completionCriteria.map((c) => {
|
|
3055
|
-
const met = c.check(state);
|
|
3056
|
-
return `${c.name}: ${met ? "MET" : "NOT MET"}`;
|
|
3057
|
-
});
|
|
3058
|
-
const metCount = results.filter((r) => r.includes("MET") && !r.includes("NOT")).length;
|
|
3059
|
-
return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
|
|
3060
|
-
}
|
|
3061
3020
|
};
|
|
3021
|
+
/**
 * Summarise which of a persona's completion criteria the given state satisfies.
 *
 * Each criterion's `check(state)` is called exactly once. The met count is taken
 * from the boolean outcomes themselves rather than by scanning the formatted
 * strings, so a criterion whose name happens to contain "NOT" or "MET"
 * (e.g. "NOTES drafted") is still counted correctly.
 *
 * @param {{ completionCriteria: Array<{ name: string, check: (state: any) => any }> }} persona
 *   Persona whose `completionCriteria` are evaluated.
 * @param {any} state State object handed unchanged to every `check`.
 * @returns {string} `"<met>/<total> — <name>: MET|NOT MET, ..."`.
 */
function describeCompletion(persona, state) {
  const outcomes = persona.completionCriteria.map((c) => ({
    name: c.name,
    met: Boolean(c.check(state))
  }));
  const metCount = outcomes.filter((o) => o.met).length;
  const results = outcomes.map((o) => `${o.name}: ${o.met ? "MET" : "NOT MET"}`);
  return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
}
|
|
3029
|
+
/**
 * Build the system prompt for the driver LLM that role-plays the user.
 *
 * Reads from `persona`: `role`, `goal`, and the optional `rigor`, `expertise`,
 * `pressurePoints` and `curveballs`. Reads from `state`: `tasks`, `events`,
 * `proposals.{pending,approved,rejected}` and `vaultFiles` (only the first ten
 * file names are listed). The "Nominal task criteria" line comes from
 * describeCompletion(persona, state).
 *
 * @param {object} persona Persona being simulated.
 * @param {object} state Current workspace state snapshot.
 * @param {string} [productContext=""] Optional product description; the
 *   "Product context" section is omitted when this is empty.
 * @returns {string} The complete system prompt.
 */
function buildDriverSystemPrompt(persona, state, productContext = "") {
  const rigor = persona.rigor ?? "demanding";
  // An unrecognised rigor value used to be interpolated as the text "undefined"
  // (or an inherited Object.prototype member); fall back to the default stance instead.
  const stance = Object.hasOwn(RIGOR_STANCE, rigor) ? RIGOR_STANCE[rigor] : RIGOR_STANCE.demanding;
  const expertise = persona.expertise ? ` You are ${persona.expertise}.` : "";
  // Optional section: topics the agent must surface without being told.
  const pressure = persona.pressurePoints && persona.pressurePoints.length > 0 ? `
A competent ${persona.role} here MUST get the agent to address each of:
${persona.pressurePoints.map((p) => ` - ${p}`).join("\n")}
Do NOT hand these to the agent. Probe whether it surfaces them itself. If it misses one, press on exactly that gap until it delivers or demonstrably fails.
` : "";
  // Optional section: new developments the driver may introduce mid-conversation.
  const curveballs = persona.curveballs && persona.curveballs.length > 0 ? `
Once the agent is coasting on easy answers, introduce ONE of these as a genuine new development \u2014 never as a quiz:
${persona.curveballs.map((c) => ` - ${c}`).join("\n")}
` : "";
  return `You are role-playing a real ${persona.role} putting an AI agent through its paces.${expertise}
Your objective: ${persona.goal}
You are deciding whether this agent's work is good enough to stake your professional reputation on. Assume it is not \u2014 until it proves otherwise.

${stance}
${productContext ? `Product context:
${productContext}
` : ""}Current workspace state:
- Tasks: ${state.tasks} | Events: ${state.events}
- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
- Vault files (${state.vaultFiles.length}): ${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? " \u2026" : ""}
- Nominal task criteria: ${describeCompletion(persona, state)}
${pressure}${curveballs}
How to choose your next message:
1. Silently judge the agent's last response the way a ${persona.role} would. Is every claim defended with a specific authority, figure, or mechanism? Or is it vague, hedged, or generic?
2. If it is vague or hand-waved \u2014 do NOT move on. Name the gap and demand the specific authority / figure / mechanism. "It depends" is not an answer; force the decision.
3. If it makes a claim you can challenge \u2014 challenge it. Make the agent defend or correct it.
4. If it missed something a ${persona.role} would catch \u2014 press on exactly that, without naming it for the agent.
5. If it is genuinely solid \u2014 escalate: go a layer deeper, or introduce a curveball.
6. First message \u2014 state your situation as you really would: realistic, specific, with the messy detail, but do not coach the agent.

Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on this work without redoing it. Nominal task completion is NOT sign-off \u2014 sloppy-but-complete still fails. If the agent never gets there, keep pushing; never sign off on weak work.

Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
}
|
|
3068
|
+
/**
 * Ask the driver model for the simulated user's next message.
 *
 * The system message is produced by buildDriverSystemPrompt. The user message
 * carries the last six history entries (each cut to 500 characters) followed by
 * the most recent entry (cut to 2000 characters), or an opening-message
 * instruction when the history is empty.
 *
 * @param {{ chat: Function }} tc Client exposing an async `chat(request)` method.
 * @param {object} opts
 * @param {object} opts.persona Persona being simulated.
 * @param {object} opts.state Current workspace state.
 * @param {Array<{ role: string, content: string }>} opts.history Conversation so far.
 * @param {string} [opts.productContext=""] Optional product description.
 * @param {string} [opts.model="claude-sonnet-4-6"] Model identifier passed to `tc.chat`.
 * @returns {Promise<string>} Trimmed content of the first choice, or "" when absent.
 */
async function decideNextUserTurn(tc, opts) {
  const { persona, state, history } = opts;
  const { productContext = "", model = "claude-sonnet-4-6" } = opts;

  let lastResponse = "(no conversation yet \u2014 this is the first message)";
  if (history.length > 0) {
    lastResponse = history[history.length - 1].content.slice(0, 2e3);
  }

  const transcript = [];
  for (const entry of history.slice(-6)) {
    transcript.push(`${entry.role}: ${entry.content.slice(0, 500)}`);
  }
  const recentHistory = transcript.join("\n\n");

  const systemMessage = {
    role: "system",
    content: buildDriverSystemPrompt(persona, state, productContext)
  };
  let userContent;
  if (recentHistory) {
    userContent = `Recent conversation:\n${recentHistory}\n\nThe agent's latest response:\n${lastResponse}`;
  } else {
    userContent = "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would.";
  }

  const resp = await tc.chat({
    model,
    messages: [systemMessage, { role: "user", content: userContent }],
    temperature: 0.5,
    maxTokens: 700
  });
  const reply = resp.choices?.[0]?.message?.content;
  return (reply ?? "").trim();
}
|
|
3062
3091
|
|
|
3063
3092
|
// src/integration-gates.ts
|
|
3064
3093
|
function integrationManifestValidatedPayload(input) {
|
|
@@ -4520,6 +4549,194 @@ function pathExists(obj, path) {
|
|
|
4520
4549
|
return true;
|
|
4521
4550
|
}
|
|
4522
4551
|
|
|
4552
|
+
// src/completion-verifier.ts
|
|
4553
|
+
// Filler words that tokens() drops before token-overlap scoring.
var STOPWORDS = /* @__PURE__ */ new Set([
  "the", "a", "an", "of", "for", "and",
  "or", "to", "in", "on", "with", "by"
]);
// Candidates scoring below this value are discarded by the *Candidates helpers.
var MATCH_THRESHOLD = 0.5;
// Minimum trimmed content length: shorter artifacts are skipped, and shorter
// proposal bodies are reported with null content.
var MIN_CONTENT_CHARS = 50;
|
|
4569
|
+
/**
 * Split text into a set of lower-cased tokens made of the characters a-z and 0-9.
 * Tokens of a single character and tokens listed in STOPWORDS are dropped.
 *
 * @param {string} s Text to tokenise.
 * @returns {Set<string>} Distinct remaining tokens.
 */
function tokens(s) {
  const kept = /* @__PURE__ */ new Set();
  for (const piece of s.toLowerCase().split(/[^a-z0-9]+/)) {
    if (piece.length > 1 && !STOPWORDS.has(piece)) {
      kept.add(piece);
    }
  }
  return kept;
}
|
|
4574
|
+
/**
 * Fraction of the requirement's tokens that also occur in the candidate text.
 *
 * @param {string} requirementText Text whose tokens must be covered.
 * @param {string} candidateText Text searched for those tokens.
 * @returns {number} Value in [0, 1]; 0 when the requirement yields no tokens.
 */
function tokenRecall(requirementText, candidateText) {
  const wanted = tokens(requirementText);
  if (wanted.size === 0) return 0;
  const available = tokens(candidateText);
  const matched = [...wanted].filter((tok) => available.has(tok)).length;
  return matched / wanted.size;
}
|
|
4582
|
+
/**
 * Collect the artifacts that plausibly satisfy one requirement.
 *
 * Artifacts whose trimmed content is shorter than MIN_CONTENT_CHARS are ignored.
 * The score is the token recall of the requirement's title + category against the
 * artifact's path + kind, raised to at least 1 when category and kind are equal
 * ignoring case. Anything scoring below MATCH_THRESHOLD is dropped.
 *
 * @param {{ title: string, category?: string }} req Requirement being matched.
 * @param {number} reqIndex Index of the requirement, copied onto each candidate.
 * @param {Array<{ kind: string, path?: string, content?: string }>} artifacts Produced artifacts.
 * @returns {Array<{ reqIndex: number, itemKey: string, score: number, evidence: string, content: string | null }>}
 */
function artifactCandidates(req, reqIndex, artifacts) {
  const reqText = `${req.title} ${req.category ?? ""}`;
  return artifacts.flatMap((artifact, position) => {
    const body = artifact.content ?? "";
    if (body.trim().length < MIN_CONTENT_CHARS) return [];

    const lexical = tokenRecall(reqText, `${artifact.path ?? ""} ${artifact.kind}`);
    const sameCategory = Boolean(req.category && artifact.kind) && req.category.toLowerCase() === artifact.kind.toLowerCase();
    const score = sameCategory ? Math.max(lexical, 1) : lexical;
    if (score < MATCH_THRESHOLD) return [];

    return [
      {
        reqIndex,
        itemKey: `artifact:${position}`,
        score,
        evidence: `artifact '${artifact.path ?? artifact.kind}' matched (token recall ${score.toFixed(2)})`,
        content: artifact.content ?? null
      }
    ];
  });
}
|
|
4602
|
+
/**
 * Collect the approved proposals that plausibly satisfy one requirement.
 *
 * Only proposals with status "approved" are considered. The score is the token
 * recall of the requirement's title + category against the proposal title;
 * anything below MATCH_THRESHOLD is dropped. A proposal body shorter than
 * MIN_CONTENT_CHARS (after trimming) is reported as `content: null`.
 *
 * @param {{ title: string, category?: string }} req Requirement being matched.
 * @param {number} reqIndex Index of the requirement, copied onto each candidate.
 * @param {Iterable<{ id: unknown, title: string, status: string, content?: string }>} proposals
 * @returns {Array<{ reqIndex: number, itemKey: string, score: number, evidence: string, content: string | null }>}
 */
function proposalCandidates(req, reqIndex, proposals) {
  const wanted = `${req.title} ${req.category ?? ""}`;
  const found = [];
  for (const proposal of proposals) {
    if (proposal.status === "approved") {
      const score = tokenRecall(wanted, proposal.title);
      if (score >= MATCH_THRESHOLD) {
        const body = proposal.content ?? "";
        const hasEnoughBody = body.trim().length >= MIN_CONTENT_CHARS;
        found.push({
          reqIndex,
          itemKey: `proposal:${proposal.id}`,
          score,
          evidence: `approved proposal '${proposal.title}' matched (token recall ${score.toFixed(2)})`,
          content: hasEnoughBody ? body : null
        });
      }
    }
  }
  return found;
}
|
|
4620
|
+
/**
 * Collect the tool calls whose name plausibly satisfies one requirement.
 *
 * The score is the token recall of the requirement title against the tool name;
 * anything below MATCH_THRESHOLD is dropped. Tool-call candidates never carry
 * content.
 *
 * @param {{ title: string }} req Requirement being matched.
 * @param {number} reqIndex Index of the requirement, copied onto each candidate.
 * @param {string[]} toolCalls Names of the tools that were called.
 * @returns {Array<{ reqIndex: number, itemKey: string, score: number, evidence: string, content: null }>}
 */
function toolCallCandidates(req, reqIndex, toolCalls) {
  return toolCalls.flatMap((toolName, position) => {
    const score = tokenRecall(req.title, toolName);
    if (score < MATCH_THRESHOLD) return [];
    return [
      {
        reqIndex,
        itemKey: `tool:${position}`,
        score,
        evidence: `tool call '${toolName}' matched (token recall ${score.toFixed(2)})`,
        content: null
      }
    ];
  });
}
|
|
4635
|
+
/**
 * Check a produced state against a gold task spec, requirement by requirement.
 *
 * Steps:
 *  1. Gather candidate matches (artifacts / proposals / tool calls) for every
 *     requirement, restricted by the requirement's `satisfiedBy` ("any" when unset).
 *  2. Sort candidates by descending score and assign greedily, so each
 *     requirement gets at most one item and each item serves at most one requirement.
 *  3. For every matched item that carries content, await `checkCorrectness`
 *     (one requirement at a time, in requirement order).
 *
 * A requirement is `satisfied` when it has a match and correctness was not
 * judged `false` (a match without content counts as satisfied with `correct: null`).
 *
 * @param {{ taskId: string, requirements: Array<object> }} gold Gold spec.
 * @param {{ artifacts: Array<object>, proposals: Array<object>, toolCalls: string[] }} state Produced state.
 * @param {(req: object, content: string) => Promise<{ correct: boolean, reason: string }>} checkCorrectness
 * @returns {Promise<{ taskId: string, requirements: Array<object>, completionRate: number, fullyComplete: boolean }>}
 * @throws {Error} When the gold spec has no requirements.
 */
async function verifyCompletion(gold, state, checkCorrectness) {
  if (gold.requirements.length === 0) {
    throw new Error(
      `verifyCompletion: task '${gold.taskId}' has no requirements \u2014 malformed gold spec`
    );
  }

  // Step 1 + sort: every plausible (requirement, item) pairing, best score first.
  const ranked = gold.requirements.flatMap((req, i) => {
    const by = req.satisfiedBy ?? "any";
    const allows = (source) => by === source || by === "any";
    return [
      ...(allows("artifact") ? artifactCandidates(req, i, state.artifacts) : []),
      ...(allows("proposal") ? proposalCandidates(req, i, state.proposals) : []),
      ...(allows("tool-call") ? toolCallCandidates(req, i, state.toolCalls) : [])
    ];
  });
  ranked.sort((x, y) => y.score - x.score);

  // Step 2: greedy one-to-one assignment.
  const chosen = /* @__PURE__ */ new Map();
  const usedItems = /* @__PURE__ */ new Set();
  for (const cand of ranked) {
    const isFree = !chosen.has(cand.reqIndex) && !usedItems.has(cand.itemKey);
    if (isFree) {
      chosen.set(cand.reqIndex, cand);
      usedItems.add(cand.itemKey);
    }
  }

  // Step 3: per-requirement verdicts.
  const requirements = [];
  for (const [i, req] of gold.requirements.entries()) {
    const match = chosen.get(i);
    const structurallyPresent = match !== void 0;
    const evidence = [];
    let correct = null;

    if (!structurallyPresent) {
      const by = req.satisfiedBy ?? "any";
      const kind = by === "any" ? "artifact/proposal/tool-call" : by;
      evidence.push(`no produced ${kind} matched this requirement`);
    } else if (match.content === null) {
      evidence.push(match.evidence);
      evidence.push("correctness: not assessed \u2014 matched item carries no content");
    } else {
      evidence.push(match.evidence);
      const verdict = await checkCorrectness(req, match.content);
      correct = verdict.correct;
      evidence.push(`correctness: ${verdict.correct ? "pass" : "fail"} \u2014 ${verdict.reason}`);
    }

    requirements.push({
      reqId: req.reqId,
      title: req.title,
      structurallyPresent,
      correct,
      satisfied: structurallyPresent && correct !== false,
      evidence
    });
  }

  const satisfiedCount = requirements.reduce((n, r) => (r.satisfied ? n + 1 : n), 0);
  return {
    taskId: gold.taskId,
    requirements,
    completionRate: satisfiedCount / requirements.length,
    fullyComplete: satisfiedCount === requirements.length
  };
}
|
|
4701
|
+
/**
 * Extract the `{ correct, reason }` verdict from a correctness-checker model reply.
 *
 * The reply may wrap the JSON object in surrounding prose; the span from the
 * first "{" to the last "}" is parsed.
 *
 * @param {string} raw Raw model response text.
 * @returns {{ correct: boolean, reason: string }} Parsed verdict; `reason` is ""
 *   when the model omitted it or supplied a non-string.
 * @throws {Error} When the reply contains no JSON object, or `correct` is not a boolean.
 * @throws {SyntaxError} When the extracted span is not valid JSON. The original
 *   parser error is attached as `cause`.
 */
function parseCorrectnessResponse(raw) {
  const match = raw.match(/\{[\s\S]*\}/);
  if (!match) {
    throw new Error(`correctness checker: no JSON object in model response: ${raw.slice(0, 200)}`);
  }
  let parsed;
  try {
    parsed = JSON.parse(match[0]);
  } catch (err) {
    // Keep the SyntaxError type, but say which component failed and on what
    // payload, matching the other errors thrown here.
    throw new SyntaxError(
      `correctness checker: invalid JSON in model response: ${match[0].slice(0, 200)}`,
      { cause: err }
    );
  }
  if (typeof parsed.correct !== "boolean") {
    throw new Error(`correctness checker: 'correct' is not a boolean in: ${match[0].slice(0, 200)}`);
  }
  return { correct: parsed.correct, reason: typeof parsed.reason === "string" ? parsed.reason : "" };
}
|
|
4712
|
+
/**
 * Create an LLM-backed correctness checker.
 *
 * The returned function sends the requirement (title and, when present,
 * category) together with the produced content, cut to `maxContentChars`
 * characters, to `tc.chat` at temperature 0, then parses the reply with
 * parseCorrectnessResponse.
 *
 * @param {{ chat: Function }} tc Client exposing an async `chat(request)` method.
 * @param {{ model?: string, maxContentChars?: number }} [opts={}]
 *   `model` defaults to "claude-sonnet-4-6", `maxContentChars` to 8000.
 * @returns {(requirement: { title: string, category?: string }, content: string) => Promise<{ correct: boolean, reason: string }>}
 */
function createLlmCorrectnessChecker(tc, opts = {}) {
  const model = opts.model ?? "claude-sonnet-4-6";
  const maxContentChars = opts.maxContentChars ?? 8e3;

  return async (requirement, content) => {
    const categoryLine = requirement.category ? `Category: ${requirement.category}\n` : "";
    const userContent = `Requirement: ${requirement.title}\n${categoryLine}\nProduced artifact:\n${content.slice(0, maxContentChars)}`;
    const request = {
      model,
      messages: [
        {
          role: "system",
          content: 'You verify whether a produced work artifact actually fulfils a stated requirement. Judge fulfilment only \u2014 is the deliverable substantively present and on-point \u2014 not polish. A plan to do it later, a vague gesture, or a description of what should be done does NOT fulfil a requirement; the artifact must BE the deliverable. Respond with a single JSON object: {"correct": boolean, "reason": string (<= 30 words)}.'
        },
        { role: "user", content: userContent }
      ],
      temperature: 0,
      maxTokens: 200
    };
    const resp = await tc.chat(request);
    const raw = resp.choices?.[0]?.message?.content ?? "";
    return parseCorrectnessResponse(raw);
  };
}
|
|
4739
|
+
|
|
4523
4740
|
// src/dual-agent-bench.ts
|
|
4524
4741
|
var DualAgentBench = class {
|
|
4525
4742
|
async run(config) {
|
|
@@ -5174,6 +5391,40 @@ function canonicalInstruction(value) {
|
|
|
5174
5391
|
return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
|
|
5175
5392
|
}
|
|
5176
5393
|
|
|
5394
|
+
// src/produced-state.ts
|
|
5395
|
+
/**
 * Map a MIME type to a coarse artifact kind.
 *
 * MIME type and subtype names are case-insensitive, so the value is lower-cased
 * (and trimmed) before matching; "Application/JSON" and "TEXT/plain" therefore
 * map the same way as their lower-case forms.
 *
 * @param {string | null | undefined} mimeType MIME type of the artifact, if known.
 * @returns {"json" | "text" | "file"} "json" when the type mentions json, "text"
 *   for any text/* type, otherwise "file" (including when no type is given).
 */
function artifactKind(mimeType) {
  if (!mimeType) return "file";
  const normalized = mimeType.trim().toLowerCase();
  if (normalized.includes("json")) return "json";
  if (normalized.startsWith("text/")) return "text";
  return "file";
}
|
|
5401
|
+
/**
 * Reduce an event stream to the state an agent produced.
 *
 * - "tool_call" events contribute their `toolName` once each, in first-seen
 *   order; events without a name are ignored.
 * - "artifact" events contribute `{ kind, path, content }`, where `kind` comes
 *   from artifactKind(mimeType), `path` is the first of name / uri / artifactId
 *   that is not null or undefined, and `content` defaults to "".
 * - "proposal_created" events contribute `{ id, title, status }` with `status`
 *   defaulting to "pending".
 * Every other event type is ignored.
 *
 * @param {Iterable<object>} events Events to scan, in order.
 * @returns {{ artifacts: Array<object>, proposals: Array<object>, toolCalls: string[] }}
 */
function extractProducedState(events) {
  const artifacts = [];
  const proposals = [];
  // A Set keeps insertion order, so it doubles as the de-duplicated tool list.
  const toolNames = /* @__PURE__ */ new Set();

  for (const ev of events) {
    switch (ev.type) {
      case "tool_call":
        if (ev.toolName) toolNames.add(ev.toolName);
        break;
      case "artifact":
        artifacts.push({
          kind: artifactKind(ev.mimeType),
          path: ev.name ?? ev.uri ?? ev.artifactId,
          content: ev.content ?? ""
        });
        break;
      case "proposal_created":
        proposals.push({
          id: ev.proposalId,
          title: ev.title,
          status: ev.status ?? "pending"
        });
        break;
      default:
        break;
    }
  }

  return { artifacts, proposals, toolCalls: [...toolNames] };
}
|
|
5427
|
+
|
|
5177
5428
|
// src/prompt-registry.ts
|
|
5178
5429
|
var PromptRegistry = class {
|
|
5179
5430
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -9092,8 +9343,8 @@ function ratio(numerator, denominator) {
|
|
|
9092
9343
|
return denominator > 0 ? numerator / denominator : 0;
|
|
9093
9344
|
}
|
|
9094
9345
|
function tokenJaccard(a, b) {
|
|
9095
|
-
const left = new Set(
|
|
9096
|
-
const right = new Set(
|
|
9346
|
+
const left = new Set(tokens2(a));
|
|
9347
|
+
const right = new Set(tokens2(b));
|
|
9097
9348
|
if (left.size === 0 || right.size === 0) return 0;
|
|
9098
9349
|
let intersection = 0;
|
|
9099
9350
|
for (const token of left) {
|
|
@@ -9111,7 +9362,7 @@ function tagOverlap(a, b) {
|
|
|
9111
9362
|
}
|
|
9112
9363
|
return intersection / Math.max(left.size, right.size);
|
|
9113
9364
|
}
|
|
9114
|
-
function
|
|
9365
|
+
/**
 * Tokenise text for similarity scoring: normalise it, split on whitespace, and
 * keep words of at least three characters that are not in STOP_WORDS.
 *
 * @param {string} text Text to tokenise.
 * @returns {string[]} Remaining words in their original order (duplicates kept).
 */
function tokens2(text) {
  const words = normalize(text).split(/\s+/);
  return words.filter((word) => word.length >= 3 && !STOP_WORDS.has(word));
}
|
|
9117
9368
|
function normalize(text) {
|
|
@@ -10545,6 +10796,7 @@ export {
|
|
|
10545
10796
|
blockingKnowledgeEval,
|
|
10546
10797
|
bonferroni,
|
|
10547
10798
|
bootstrapCi,
|
|
10799
|
+
buildDriverSystemPrompt,
|
|
10548
10800
|
buildReflectionPrompt,
|
|
10549
10801
|
buildReviewerPrompt,
|
|
10550
10802
|
buildTraceAnalystTools,
|
|
@@ -10595,6 +10847,7 @@ export {
|
|
|
10595
10847
|
createFeedbackTrajectory,
|
|
10596
10848
|
createIntentMatchJudge,
|
|
10597
10849
|
createJudgeAdapter,
|
|
10850
|
+
createLlmCorrectnessChecker,
|
|
10598
10851
|
createLlmReviewer,
|
|
10599
10852
|
createReplayFetch,
|
|
10600
10853
|
createRunCriticAdapter,
|
|
@@ -10607,6 +10860,7 @@ export {
|
|
|
10607
10860
|
createVerifierAdapter,
|
|
10608
10861
|
crossTraceDiff,
|
|
10609
10862
|
crowdingDistance,
|
|
10863
|
+
decideNextUserTurn,
|
|
10610
10864
|
decideReferenceReplayPromotion,
|
|
10611
10865
|
decideReferenceReplayRunPromotion,
|
|
10612
10866
|
defaultIsMaterial,
|
|
@@ -10637,6 +10891,7 @@ export {
|
|
|
10637
10891
|
exportRunAsOtlp,
|
|
10638
10892
|
extractAssetUrls,
|
|
10639
10893
|
extractErrorCount,
|
|
10894
|
+
extractProducedState,
|
|
10640
10895
|
feedbackTrajectoriesToDatasetScenarios,
|
|
10641
10896
|
feedbackTrajectoriesToOptimizerRows,
|
|
10642
10897
|
feedbackTrajectoryToDatasetScenario,
|
|
@@ -10714,6 +10969,7 @@ export {
|
|
|
10714
10969
|
paretoChart,
|
|
10715
10970
|
paretoFrontier,
|
|
10716
10971
|
paretoFrontierWithCrowding,
|
|
10972
|
+
parseCorrectnessResponse,
|
|
10717
10973
|
parseFeedbackTrajectoriesJsonl,
|
|
10718
10974
|
parseFindingSubject,
|
|
10719
10975
|
parseRawFinding,
|
|
@@ -10825,6 +11081,7 @@ export {
|
|
|
10825
11081
|
userQuestionsForKnowledgeGaps,
|
|
10826
11082
|
validateRunRecord,
|
|
10827
11083
|
verbosityBias,
|
|
11084
|
+
verifyCompletion,
|
|
10828
11085
|
verifyManifest,
|
|
10829
11086
|
visualDiff,
|
|
10830
11087
|
viteDeployRunner,
|