@tangle-network/agent-eval 0.31.1 → 0.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +225 -3
- package/dist/index.js +292 -43
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -2900,6 +2900,11 @@ var MetricsCollector = class {
|
|
|
2900
2900
|
};
|
|
2901
2901
|
|
|
2902
2902
|
// src/driver.ts
|
|
2903
|
+
/**
 * Stance paragraph inserted into the simulated-user system prompt, keyed by
 * the persona's `rigor` level (`cooperative`, `demanding`, `relentless`).
 *
 * Frozen so that no caller can alter the shared prompt text at runtime.
 */
var RIGOR_STANCE = Object.freeze({
  cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
  demanding: "Your stance: an experienced professional with no time to waste. You do not accept vague, hedged, or generic answers \u2014 you expect specifics, and you say so plainly when you do not get them.",
  relentless: "Your stance: a senior partner reviewing this work for a client who will litigate if it is wrong. You interrogate every claim. You accept nothing undefended. You find the single weakest point in every answer and attack it. Courteous, never satisfied."
});
|
|
2903
2908
|
var AgentDriver = class {
|
|
2904
2909
|
tc;
|
|
2905
2910
|
client;
|
|
@@ -2929,12 +2934,14 @@ var AgentDriver = class {
|
|
|
2929
2934
|
const conversationHistory = [];
|
|
2930
2935
|
let completed = false;
|
|
2931
2936
|
let turnsToCompletion = null;
|
|
2937
|
+
let criteriaMetAtTurn = null;
|
|
2932
2938
|
for (let turn = 1; turn <= persona.maxTurns; turn++) {
|
|
2933
2939
|
const state = await metrics.getState();
|
|
2934
2940
|
const userMessage = await this.decideNextMessage(persona, state, conversationHistory);
|
|
2935
2941
|
if (userMessage === "DONE") {
|
|
2936
2942
|
completed = true;
|
|
2937
2943
|
turnsToCompletion = turn - 1;
|
|
2944
|
+
console.log(` SIGNED OFF by simulated ${persona.role} after turn ${turn - 1}`);
|
|
2938
2945
|
break;
|
|
2939
2946
|
}
|
|
2940
2947
|
const turnStart = Date.now();
|
|
@@ -2963,11 +2970,9 @@ var AgentDriver = class {
|
|
|
2963
2970
|
console.log(
|
|
2964
2971
|
` [turn ${turn}] ${conv.completionPercent.toFixed(0)}% \u2014 ${criteriaStr} (${(latency / 1e3).toFixed(1)}s)`
|
|
2965
2972
|
);
|
|
2966
|
-
if (conv.complete) {
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
console.log(` COMPLETE at turn ${turn}`);
|
|
2970
|
-
break;
|
|
2973
|
+
if (conv.complete && criteriaMetAtTurn === null) {
|
|
2974
|
+
criteriaMetAtTurn = turn;
|
|
2975
|
+
console.log(` criteria met at turn ${turn} \u2014 driver continues pressure-testing`);
|
|
2971
2976
|
}
|
|
2972
2977
|
}
|
|
2973
2978
|
const finalState = await metrics.getState();
|
|
@@ -2975,6 +2980,7 @@ var AgentDriver = class {
|
|
|
2975
2980
|
personaId: persona.id,
|
|
2976
2981
|
completed,
|
|
2977
2982
|
turnsToCompletion,
|
|
2983
|
+
criteriaMetAtTurn,
|
|
2978
2984
|
totalTurns: turnMetrics.length,
|
|
2979
2985
|
metrics: turnMetrics,
|
|
2980
2986
|
finalState,
|
|
@@ -2992,41 +2998,19 @@ var AgentDriver = class {
|
|
|
2992
2998
|
messages: [
|
|
2993
2999
|
{
|
|
2994
3000
|
role: "system",
|
|
2995
|
-
content:
|
|
2996
|
-
Your goal: ${persona.goal}
|
|
2997
|
-
|
|
2998
|
-
${this.productContext ? `Product context:
|
|
2999
|
-
${this.productContext}
|
|
3000
|
-
` : ""}
|
|
3001
|
-
Current state:
|
|
3002
|
-
- Tasks: ${state.tasks}
|
|
3003
|
-
- Events: ${state.events}
|
|
3004
|
-
- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
|
|
3005
|
-
- Vault files: ${state.vaultFiles.length} (${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? "..." : ""})
|
|
3006
|
-
|
|
3007
|
-
Completion criteria met: ${this.describeCompletion(persona, state)}
|
|
3008
|
-
|
|
3009
|
-
Decide what to do next:
|
|
3010
|
-
1. If completion is 100% \u2014 respond with exactly "DONE"
|
|
3011
|
-
2. If a proposal is pending \u2014 approve or reject it (with reason)
|
|
3012
|
-
3. If the agent is on track \u2014 push for the next deliverable
|
|
3013
|
-
4. If the agent is off track \u2014 give specific corrective feedback
|
|
3014
|
-
5. If this is the first message \u2014 start with a clear, actionable request
|
|
3015
|
-
|
|
3016
|
-
Output ONLY your next message to the agent. Be specific. Be realistic.
|
|
3017
|
-
Don't be patient \u2014 a real ${persona.role} wouldn't accept vague answers.`
|
|
3001
|
+
content: buildDriverSystemPrompt(persona, state, this.productContext)
|
|
3018
3002
|
},
|
|
3019
3003
|
{
|
|
3020
3004
|
role: "user",
|
|
3021
3005
|
content: recentHistory ? `Recent conversation:
|
|
3022
3006
|
${recentHistory}
|
|
3023
3007
|
|
|
3024
|
-
The agent
|
|
3025
|
-
${lastResponse}` : "No conversation yet. Send your opening message."
|
|
3008
|
+
The agent's latest response:
|
|
3009
|
+
${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
|
|
3026
3010
|
}
|
|
3027
3011
|
],
|
|
3028
3012
|
temperature: 0.5,
|
|
3029
|
-
maxTokens:
|
|
3013
|
+
maxTokens: 700
|
|
3030
3014
|
});
|
|
3031
3015
|
const content = resp.choices?.[0]?.message?.content ?? "";
|
|
3032
3016
|
return content.trim();
|
|
@@ -3049,16 +3033,54 @@ ${lastResponse}` : "No conversation yet. Send your opening message."
|
|
|
3049
3033
|
}
|
|
3050
3034
|
}
|
|
3051
3035
|
}
|
|
3052
|
-
/** Describe which completion criteria are met */
|
|
3053
|
-
describeCompletion(persona, state) {
|
|
3054
|
-
const results = persona.completionCriteria.map((c) => {
|
|
3055
|
-
const met = c.check(state);
|
|
3056
|
-
return `${c.name}: ${met ? "MET" : "NOT MET"}`;
|
|
3057
|
-
});
|
|
3058
|
-
const metCount = results.filter((r) => r.includes("MET") && !r.includes("NOT")).length;
|
|
3059
|
-
return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
|
|
3060
|
-
}
|
|
3061
3036
|
};
|
|
3037
|
+
/**
 * Summarise which of a persona's completion criteria are currently met.
 *
 * @param persona Object with a `completionCriteria` array; each entry has a
 *   `name` and a `check(state)` predicate.
 * @param state Workspace state handed to every `check`.
 * @returns A string of the form `"<met>/<total> \u2014 <name>: MET, <name>: NOT MET"`.
 */
function describeCompletion(persona, state) {
  // Count from the predicate result itself. Deriving the count from the
  // rendered label would miscount any criterion whose name contains "NOT"
  // (for example "NOTES drafted").
  let metCount = 0;
  const results = persona.completionCriteria.map((c) => {
    const met = Boolean(c.check(state));
    if (met) metCount++;
    return `${c.name}: ${met ? "MET" : "NOT MET"}`;
  });
  return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
}
|
|
3045
|
+
/**
 * Build the system prompt for the simulated user that drives the agent.
 *
 * @param persona Reads `role`, `goal`, `completionCriteria`, and the optional
 *   `rigor`, `expertise`, `pressurePoints` and `curveballs`.
 * @param state Workspace state; reads `tasks`, `events`, `proposals`
 *   (`pending` / `approved` / `rejected`) and `vaultFiles`.
 * @param productContext Optional product description; its section is omitted
 *   when empty.
 * @returns The complete system prompt text.
 */
function buildDriverSystemPrompt(persona, state, productContext = "") {
  // An unrecognised rigor value would otherwise interpolate the literal text
  // "undefined" into the prompt, so fall back to the default stance.
  const requestedRigor = persona.rigor ?? "demanding";
  const stance = Object.prototype.hasOwnProperty.call(RIGOR_STANCE, requestedRigor) ? RIGOR_STANCE[requestedRigor] : RIGOR_STANCE.demanding;
  const expertise = persona.expertise ? ` You are ${persona.expertise}.` : "";
  // Topics the simulated user must get covered without naming them to the agent.
  const pressure = persona.pressurePoints && persona.pressurePoints.length > 0 ? `
A competent ${persona.role} here MUST get the agent to address each of:
${persona.pressurePoints.map((p) => ` - ${p}`).join("\n")}
Do NOT hand these to the agent. Probe whether it surfaces them itself. If it misses one, press on exactly that gap until it delivers or demonstrably fails.
` : "";
  // Optional mid-conversation developments the simulated user may introduce.
  const curveballs = persona.curveballs && persona.curveballs.length > 0 ? `
Once the agent is coasting on easy answers, introduce ONE of these as a genuine new development \u2014 never as a quiz:
${persona.curveballs.map((c) => ` - ${c}`).join("\n")}
` : "";
  return `You are role-playing a real ${persona.role} putting an AI agent through its paces.${expertise}
Your objective: ${persona.goal}
You are deciding whether this agent's work is good enough to stake your professional reputation on. Assume it is not \u2014 until it proves otherwise.

${stance}
${productContext ? `Product context:
${productContext}
` : ""}Current workspace state:
- Tasks: ${state.tasks} | Events: ${state.events}
- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
- Vault files (${state.vaultFiles.length}): ${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? " \u2026" : ""}
- Nominal task criteria: ${describeCompletion(persona, state)}
${pressure}${curveballs}
How to choose your next message:
1. Silently judge the agent's last response the way a ${persona.role} would. Is every claim defended with a specific authority, figure, or mechanism? Or is it vague, hedged, or generic?
2. If it is vague or hand-waved \u2014 do NOT move on. Name the gap and demand the specific authority / figure / mechanism. "It depends" is not an answer; force the decision.
3. If it makes a claim you can challenge \u2014 challenge it. Make the agent defend or correct it.
4. If it missed something a ${persona.role} would catch \u2014 press on exactly that, without naming it for the agent.
5. If it is genuinely solid \u2014 escalate: go a layer deeper, or introduce a curveball.
6. First message \u2014 state your situation as you really would: realistic, specific, with the messy detail, but do not coach the agent.

Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on this work without redoing it. Nominal task completion is NOT sign-off \u2014 sloppy-but-complete still fails. If the agent never gets there, keep pushing; never sign off on weak work.

Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
}
|
|
3062
3084
|
|
|
3063
3085
|
// src/integration-gates.ts
|
|
3064
3086
|
function integrationManifestValidatedPayload(input) {
|
|
@@ -4520,6 +4542,194 @@ function pathExists(obj, path) {
|
|
|
4520
4542
|
return true;
|
|
4521
4543
|
}
|
|
4522
4544
|
|
|
4545
|
+
// src/completion-verifier.ts
|
|
4546
|
+
// Words dropped before token matching.
var STOPWORDS = /* @__PURE__ */ new Set(
  "the a an of for and or to in on with by".split(" ")
);
// Minimum token-recall score for a produced item to count as a match.
var MATCH_THRESHOLD = 0.5;
// Minimum trimmed content length for content to be treated as substantive.
var MIN_CONTENT_CHARS = 50;
|
|
4562
|
+
/**
 * Split text into a set of lower-cased ASCII alphanumeric tokens, dropping
 * single-character tokens and stopwords.
 *
 * @param s Text to tokenise. `null` / `undefined` are treated as empty text
 *   and other non-strings are stringified, so a missing title yields an empty
 *   set instead of throwing.
 * @returns {Set<string>} The distinct remaining tokens.
 */
function tokens(s) {
  const text = typeof s === "string" ? s : String(s ?? "");
  return new Set(
    text.toLowerCase().split(/[^a-z0-9]+/).filter((t) => t.length > 1 && !STOPWORDS.has(t))
  );
}
|
|
4567
|
+
/**
 * Fraction of the requirement's tokens that also occur in the candidate text.
 *
 * @returns A number in [0, 1]; 0 when the requirement yields no tokens.
 */
function tokenRecall(requirementText, candidateText) {
  const wanted = tokens(requirementText);
  if (wanted.size === 0) {
    return 0;
  }
  const available = tokens(candidateText);
  const matched = [...wanted].filter((t) => available.has(t)).length;
  return matched / wanted.size;
}
|
|
4575
|
+
/**
 * List the artifacts that plausibly satisfy a requirement.
 *
 * Artifacts whose trimmed content is shorter than MIN_CONTENT_CHARS are
 * ignored. The score is the token recall of the requirement's title and
 * category against the artifact's path and kind, raised to at least 1 when
 * the requirement category equals the artifact kind (case-insensitive).
 * Scores below MATCH_THRESHOLD are dropped.
 *
 * @returns Candidate records `{ reqIndex, itemKey, score, evidence, content }`.
 */
function artifactCandidates(req, reqIndex, artifacts) {
  const reqText = `${req.title} ${req.category ?? ""}`;
  return artifacts.flatMap((artifact, index) => {
    const body = artifact.content ?? "";
    if (body.trim().length < MIN_CONTENT_CHARS) {
      return [];
    }
    const recall = tokenRecall(reqText, `${artifact.path ?? ""} ${artifact.kind}`);
    const kindMatchesCategory = Boolean(req.category) && Boolean(artifact.kind) && req.category.toLowerCase() === artifact.kind.toLowerCase();
    const score = kindMatchesCategory ? Math.max(recall, 1) : recall;
    if (score < MATCH_THRESHOLD) {
      return [];
    }
    return [
      {
        reqIndex,
        itemKey: `artifact:${index}`,
        score,
        evidence: `artifact '${artifact.path ?? artifact.kind}' matched (token recall ${score.toFixed(2)})`,
        content: artifact.content ?? null
      }
    ];
  });
}
|
|
4595
|
+
/**
 * List the approved proposals whose title plausibly satisfies a requirement.
 *
 * Only proposals with status "approved" are considered. The score is the
 * token recall of the requirement's title and category against the proposal
 * title. A candidate carries the proposal body as `content` only when its
 * trimmed length reaches MIN_CONTENT_CHARS; otherwise `content` is null.
 *
 * @returns Candidate records `{ reqIndex, itemKey, score, evidence, content }`.
 */
function proposalCandidates(req, reqIndex, proposals) {
  const reqText = `${req.title} ${req.category ?? ""}`;
  const found = [];
  for (const proposal of proposals) {
    if (proposal.status === "approved") {
      const score = tokenRecall(reqText, proposal.title);
      if (!(score < MATCH_THRESHOLD)) {
        const body = proposal.content ?? "";
        const substantive = body.trim().length >= MIN_CONTENT_CHARS;
        found.push({
          reqIndex,
          itemKey: `proposal:${proposal.id}`,
          score,
          evidence: `approved proposal '${proposal.title}' matched (token recall ${score.toFixed(2)})`,
          content: substantive ? body : null
        });
      }
    }
  }
  return found;
}
|
|
4613
|
+
/**
 * List the tool calls whose name plausibly satisfies a requirement.
 *
 * The score is the token recall of the requirement title against the tool
 * name. Tool-call candidates never carry content.
 *
 * @returns Candidate records `{ reqIndex, itemKey, score, evidence, content: null }`.
 */
function toolCallCandidates(req, reqIndex, toolCalls) {
  return toolCalls.flatMap((toolName, position) => {
    const score = tokenRecall(req.title, toolName);
    if (score < MATCH_THRESHOLD) {
      return [];
    }
    return [
      {
        reqIndex,
        itemKey: `tool:${position}`,
        score,
        evidence: `tool call '${toolName}' matched (token recall ${score.toFixed(2)})`,
        content: null
      }
    ];
  });
}
|
|
4628
|
+
/**
 * Check a produced state against a gold task specification.
 *
 * Each requirement is matched to at most one produced item (artifact,
 * approved proposal or tool call, restricted by `req.satisfiedBy`, default
 * "any"), and each item serves at most one requirement. Candidates are
 * assigned greedily in descending score order. When the matched item carries
 * content, `checkCorrectness(req, content)` decides `correct`; otherwise
 * `correct` stays null.
 *
 * A requirement is `satisfied` when an item matched and correctness was not
 * judged false.
 *
 * @param gold `{ taskId, requirements }`; `requirements` must be a non-empty array.
 * @param state `{ artifacts, proposals, toolCalls }`; a missing list is
 *   treated as empty.
 * @param checkCorrectness Async function resolving to `{ correct, reason }`.
 * @returns `{ taskId, requirements, completionRate, fullyComplete }`.
 * @throws {Error} When `gold.requirements` is missing, not an array, or empty.
 */
async function verifyCompletion(gold, state, checkCorrectness) {
  // A missing or non-array list is reported as the same malformed-spec error
  // as an empty one, rather than as an opaque TypeError.
  if (!Array.isArray(gold.requirements) || gold.requirements.length === 0) {
    throw new Error(
      `verifyCompletion: task '${gold.taskId}' has no requirements \u2014 malformed gold spec`
    );
  }
  // A state without a given list simply contributes no candidates of that kind.
  const artifacts = state.artifacts ?? [];
  const proposals = state.proposals ?? [];
  const toolCalls = state.toolCalls ?? [];
  const candidates = [];
  gold.requirements.forEach((req, i) => {
    const by = req.satisfiedBy ?? "any";
    if (by === "artifact" || by === "any") {
      candidates.push(...artifactCandidates(req, i, artifacts));
    }
    if (by === "proposal" || by === "any") {
      candidates.push(...proposalCandidates(req, i, proposals));
    }
    if (by === "tool-call" || by === "any") {
      candidates.push(...toolCallCandidates(req, i, toolCalls));
    }
  });
  // Highest score first; the first free pairing of requirement and item wins.
  candidates.sort((a, b) => b.score - a.score);
  const assigned = /* @__PURE__ */ new Map();
  const itemTaken = /* @__PURE__ */ new Set();
  for (const c of candidates) {
    if (assigned.has(c.reqIndex) || itemTaken.has(c.itemKey)) continue;
    assigned.set(c.reqIndex, c);
    itemTaken.add(c.itemKey);
  }
  const requirements = [];
  for (let i = 0; i < gold.requirements.length; i++) {
    const req = gold.requirements[i];
    const match = assigned.get(i);
    const evidence = [];
    let correct = null;
    if (match) {
      evidence.push(match.evidence);
      if (match.content !== null) {
        const r = await checkCorrectness(req, match.content);
        correct = r.correct;
        evidence.push(`correctness: ${r.correct ? "pass" : "fail"} \u2014 ${r.reason}`);
      } else {
        evidence.push("correctness: not assessed \u2014 matched item carries no content");
      }
    } else {
      const by = req.satisfiedBy ?? "any";
      const kind = by === "any" ? "artifact/proposal/tool-call" : by;
      evidence.push(`no produced ${kind} matched this requirement`);
    }
    const structurallyPresent = match !== void 0;
    // An unassessed match (correct === null) still counts as satisfied.
    const satisfied = structurallyPresent && correct !== false;
    requirements.push({
      reqId: req.reqId,
      title: req.title,
      structurallyPresent,
      correct,
      satisfied,
      evidence
    });
  }
  const satisfiedCount = requirements.filter((r) => r.satisfied).length;
  return {
    taskId: gold.taskId,
    requirements,
    completionRate: satisfiedCount / requirements.length,
    fullyComplete: satisfiedCount === requirements.length
  };
}
|
|
4694
|
+
/**
 * Extract the `{"correct": boolean, "reason": string}` verdict from a model
 * response that may wrap the JSON in surrounding prose.
 *
 * The span from the first `{` to the last `}` is tried first. If that is not
 * valid JSON (for example because trailing prose also contains braces), each
 * `{` is tried as the start of a balanced, string-aware object, and the first
 * one that parses with a boolean `correct` is used.
 *
 * @param raw Raw model output.
 * @returns `{ correct, reason }`; `reason` is "" when absent or not a string.
 * @throws {Error} When the response contains no `{…}` span, or the parsed
 *   object has no boolean `correct`.
 * @throws {SyntaxError} When no parseable JSON object can be recovered.
 */
function parseCorrectnessResponse(raw) {
  const text = typeof raw === "string" ? raw : String(raw ?? "");
  const greedy = text.match(/\{[\s\S]*\}/);
  if (!greedy) {
    throw new Error(`correctness checker: no JSON object in model response: ${text.slice(0, 200)}`);
  }
  // Return the balanced `{…}` span starting at `start`, ignoring braces that
  // appear inside JSON string literals; null when it never closes.
  const balancedObjectAt = (start) => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === "{") {
        depth++;
      } else if (ch === "}") {
        depth--;
        if (depth === 0) return text.slice(start, i + 1);
      }
    }
    return null;
  };
  let source = greedy[0];
  let parsed;
  try {
    parsed = JSON.parse(source);
  } catch (err) {
    let firstParsed = null;
    let verdict = null;
    for (let at = text.indexOf("{"); at !== -1 && verdict === null; at = text.indexOf("{", at + 1)) {
      const span = balancedObjectAt(at);
      if (span === null) continue;
      let value;
      try {
        value = JSON.parse(span);
      } catch {
        // Not JSON from this brace; try the next one.
        continue;
      }
      if (typeof value.correct === "boolean") {
        verdict = { value, span };
      } else if (firstParsed === null) {
        firstParsed = { value, span };
      }
    }
    const recovered = verdict ?? firstParsed;
    // Nothing parseable anywhere: surface the original parse failure unchanged.
    if (recovered === null) throw err;
    parsed = recovered.value;
    source = recovered.span;
  }
  if (typeof parsed.correct !== "boolean") {
    throw new Error(`correctness checker: 'correct' is not a boolean in: ${source.slice(0, 200)}`);
  }
  return { correct: parsed.correct, reason: typeof parsed.reason === "string" ? parsed.reason : "" };
}
|
|
4705
|
+
/**
 * Create a correctness checker backed by a chat model.
 *
 * @param tc Client exposing `chat({ model, messages, temperature, maxTokens })`.
 * @param opts Optional `model` (default "claude-sonnet-4-6") and
 *   `maxContentChars` (default 8000), the number of artifact characters sent.
 * @returns An async function `(requirement, content)` that sends the
 *   requirement and the truncated content to the model and returns the parsed
 *   `{ correct, reason }` verdict.
 */
function createLlmCorrectnessChecker(tc, opts = {}) {
  const model = opts.model ?? "claude-sonnet-4-6";
  const maxContentChars = opts.maxContentChars ?? 8e3;
  const systemPrompt = 'You verify whether a produced work artifact actually fulfils a stated requirement. Judge fulfilment only \u2014 is the deliverable substantively present and on-point \u2014 not polish. A plan to do it later, a vague gesture, or a description of what should be done does NOT fulfil a requirement; the artifact must BE the deliverable. Respond with a single JSON object: {"correct": boolean, "reason": string (<= 30 words)}.';
  return async (requirement, content) => {
    // The category line appears only when the requirement has a category.
    const categoryLine = requirement.category ? `Category: ${requirement.category}\n` : "";
    const userPrompt = `Requirement: ${requirement.title}\n${categoryLine}\nProduced artifact:\n${content.slice(0, maxContentChars)}`;
    const resp = await tc.chat({
      model,
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: userPrompt }
      ],
      temperature: 0,
      maxTokens: 200
    });
    const raw = resp.choices?.[0]?.message?.content ?? "";
    return parseCorrectnessResponse(raw);
  };
}
|
|
4732
|
+
|
|
4523
4733
|
// src/dual-agent-bench.ts
|
|
4524
4734
|
var DualAgentBench = class {
|
|
4525
4735
|
async run(config) {
|
|
@@ -5174,6 +5384,40 @@ function canonicalInstruction(value) {
|
|
|
5174
5384
|
return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
|
|
5175
5385
|
}
|
|
5176
5386
|
|
|
5387
|
+
// src/produced-state.ts
|
|
5388
|
+
/**
 * Map a MIME type to a coarse artifact kind.
 *
 * Matching is case-insensitive, because media type names are case-insensitive.
 *
 * @param mimeType MIME type string; may be missing.
 * @returns "json" when the type contains "json", "text" when it starts with
 *   "text/", otherwise "file" (including for missing or non-string input).
 */
function artifactKind(mimeType) {
  if (typeof mimeType !== "string" || mimeType === "") return "file";
  const normalized = mimeType.trim().toLowerCase();
  if (normalized.includes("json")) return "json";
  if (normalized.startsWith("text/")) return "text";
  return "file";
}
|
|
5394
|
+
/**
 * Collapse an event stream into the produced state used for completion checks.
 *
 * - `tool_call` events contribute their `toolName` once each, in first-seen order.
 * - `artifact` events contribute `{ kind, path, content }`, where `path` is the
 *   first present of `name`, `uri`, `artifactId`, and `content` defaults to "".
 * - `proposal_created` events contribute `{ id, title, status }`, with `status`
 *   defaulting to "pending". When the event carries a string `content` it is
 *   passed through as well.
 *   NOTE(review): assumes a proposal body, if present, is on `content` — confirm
 *   against the event schema.
 *
 * Other event types are ignored.
 *
 * @param events Iterable of event objects with a `type` field.
 * @returns `{ artifacts, proposals, toolCalls }`.
 */
function extractProducedState(events) {
  const artifacts = [];
  const proposals = [];
  const toolCalls = [];
  const seenTools = /* @__PURE__ */ new Set();
  for (const ev of events) {
    switch (ev.type) {
      case "tool_call": {
        const name = ev.toolName;
        if (name && !seenTools.has(name)) {
          seenTools.add(name);
          toolCalls.push(name);
        }
        break;
      }
      case "artifact":
        artifacts.push({
          kind: artifactKind(ev.mimeType),
          path: ev.name ?? ev.uri ?? ev.artifactId,
          content: ev.content ?? ""
        });
        break;
      case "proposal_created": {
        const proposal = { id: ev.proposalId, title: ev.title, status: ev.status ?? "pending" };
        // Keep the body so a matched proposal can have its correctness assessed.
        if (typeof ev.content === "string") {
          proposal.content = ev.content;
        }
        proposals.push(proposal);
        break;
      }
      default:
        break;
    }
  }
  return { artifacts, proposals, toolCalls };
}
|
|
5420
|
+
|
|
5177
5421
|
// src/prompt-registry.ts
|
|
5178
5422
|
var PromptRegistry = class {
|
|
5179
5423
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -9092,8 +9336,8 @@ function ratio(numerator, denominator) {
|
|
|
9092
9336
|
return denominator > 0 ? numerator / denominator : 0;
|
|
9093
9337
|
}
|
|
9094
9338
|
function tokenJaccard(a, b) {
|
|
9095
|
-
const left = new Set(
|
|
9096
|
-
const right = new Set(
|
|
9339
|
+
const left = new Set(tokens2(a));
|
|
9340
|
+
const right = new Set(tokens2(b));
|
|
9097
9341
|
if (left.size === 0 || right.size === 0) return 0;
|
|
9098
9342
|
let intersection = 0;
|
|
9099
9343
|
for (const token of left) {
|
|
@@ -9111,7 +9355,7 @@ function tagOverlap(a, b) {
|
|
|
9111
9355
|
}
|
|
9112
9356
|
return intersection / Math.max(left.size, right.size);
|
|
9113
9357
|
}
|
|
9114
|
-
function
|
|
9358
|
+
function tokens2(text) {
|
|
9115
9359
|
return normalize(text).split(/\s+/).filter((token) => token.length >= 3 && !STOP_WORDS.has(token));
|
|
9116
9360
|
}
|
|
9117
9361
|
function normalize(text) {
|
|
@@ -10545,6 +10789,7 @@ export {
|
|
|
10545
10789
|
blockingKnowledgeEval,
|
|
10546
10790
|
bonferroni,
|
|
10547
10791
|
bootstrapCi,
|
|
10792
|
+
buildDriverSystemPrompt,
|
|
10548
10793
|
buildReflectionPrompt,
|
|
10549
10794
|
buildReviewerPrompt,
|
|
10550
10795
|
buildTraceAnalystTools,
|
|
@@ -10595,6 +10840,7 @@ export {
|
|
|
10595
10840
|
createFeedbackTrajectory,
|
|
10596
10841
|
createIntentMatchJudge,
|
|
10597
10842
|
createJudgeAdapter,
|
|
10843
|
+
createLlmCorrectnessChecker,
|
|
10598
10844
|
createLlmReviewer,
|
|
10599
10845
|
createReplayFetch,
|
|
10600
10846
|
createRunCriticAdapter,
|
|
@@ -10637,6 +10883,7 @@ export {
|
|
|
10637
10883
|
exportRunAsOtlp,
|
|
10638
10884
|
extractAssetUrls,
|
|
10639
10885
|
extractErrorCount,
|
|
10886
|
+
extractProducedState,
|
|
10640
10887
|
feedbackTrajectoriesToDatasetScenarios,
|
|
10641
10888
|
feedbackTrajectoriesToOptimizerRows,
|
|
10642
10889
|
feedbackTrajectoryToDatasetScenario,
|
|
@@ -10714,6 +10961,7 @@ export {
|
|
|
10714
10961
|
paretoChart,
|
|
10715
10962
|
paretoFrontier,
|
|
10716
10963
|
paretoFrontierWithCrowding,
|
|
10964
|
+
parseCorrectnessResponse,
|
|
10717
10965
|
parseFeedbackTrajectoriesJsonl,
|
|
10718
10966
|
parseFindingSubject,
|
|
10719
10967
|
parseRawFinding,
|
|
@@ -10825,6 +11073,7 @@ export {
|
|
|
10825
11073
|
userQuestionsForKnowledgeGaps,
|
|
10826
11074
|
validateRunRecord,
|
|
10827
11075
|
verbosityBias,
|
|
11076
|
+
verifyCompletion,
|
|
10828
11077
|
verifyManifest,
|
|
10829
11078
|
visualDiff,
|
|
10830
11079
|
viteDeployRunner,
|