@tangle-network/agent-eval 0.28.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +87 -0
- package/dist/index.d.ts +139 -105
- package/dist/index.js +142 -94
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -967,6 +967,45 @@ function resolveModel(req, defaultModel) {
|
|
|
967
967
|
return { ...req, model: defaultModel };
|
|
968
968
|
}
|
|
969
969
|
|
|
970
|
+
// src/analyst/finding-signature.ts
|
|
971
|
+
import { z } from "zod";
|
|
972
|
+
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
|
|
973
|
+
var RawAnalystFindingSchema = z.object({
|
|
974
|
+
severity: z.enum(ANALYST_SEVERITIES),
|
|
975
|
+
claim: z.string().min(1).max(2e3),
|
|
976
|
+
subject: z.string().max(400).optional(),
|
|
977
|
+
evidence_uri: z.string().min(1).max(2e3),
|
|
978
|
+
evidence_excerpt: z.string().max(2e3).optional(),
|
|
979
|
+
confidence: z.number().min(0).max(1),
|
|
980
|
+
rationale: z.string().max(4e3).optional(),
|
|
981
|
+
recommended_action: z.string().max(2e3).optional()
|
|
982
|
+
}).strict();
|
|
983
|
+
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
984
|
+
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
985
|
+
- claim: one-sentence statement (max 2000 chars)
|
|
986
|
+
- subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
|
|
987
|
+
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
|
|
988
|
+
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
|
|
989
|
+
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
|
|
990
|
+
- rationale?: one or two sentences explaining the reasoning
|
|
991
|
+
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
|
|
992
|
+
|
|
993
|
+
Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
|
|
994
|
+
/**
 * Validate a single raw finding against `RawAnalystFindingSchema`.
 *
 * @param {unknown} row - Candidate finding object to validate.
 * @param {((msg: string, fields: object) => void)=} log - Optional logger; when
 *   provided it receives one "finding rejected: schema failure" entry listing
 *   every validation issue as `{ path, code, message }`.
 * @returns {object|null} The parsed finding on success, or `null` when the row
 *   does not satisfy the schema.
 */
function parseRawFinding(row, log) {
  const parsed = RawAnalystFindingSchema.safeParse(row);
  if (parsed.success) {
    return parsed.data;
  }
  // Flatten each issue to a compact, log-friendly record with a dotted path.
  const describeIssue = ({ path, code, message }) => ({
    path: path.join("."),
    code,
    message
  });
  // The issue list is built inside the call so it is only computed when a
  // logger was actually supplied.
  log?.("finding rejected: schema failure", {
    issues: parsed.error.issues.map(describeIssue)
  });
  return null;
}
|
|
1008
|
+
|
|
970
1009
|
// src/analyst/findings-store.ts
|
|
971
1010
|
import { existsSync as existsSync2, readFileSync } from "fs";
|
|
972
1011
|
|
|
@@ -1113,43 +1152,8 @@ function diffFindings(previous, current, policy = {}) {
|
|
|
1113
1152
|
return { appeared, disappeared, persisted, changed };
|
|
1114
1153
|
}
|
|
1115
1154
|
|
|
1116
|
-
// src/analyst/finding-signature.ts
|
|
1117
|
-
import { z } from "zod";
|
|
1118
|
-
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
|
|
1119
|
-
var RawAnalystFindingSchema = z.object({
|
|
1120
|
-
severity: z.enum(ANALYST_SEVERITIES),
|
|
1121
|
-
claim: z.string().min(1).max(2e3),
|
|
1122
|
-
subject: z.string().max(400).optional(),
|
|
1123
|
-
evidence_uri: z.string().min(1).max(2e3),
|
|
1124
|
-
evidence_excerpt: z.string().max(2e3).optional(),
|
|
1125
|
-
confidence: z.number().min(0).max(1),
|
|
1126
|
-
rationale: z.string().max(4e3).optional(),
|
|
1127
|
-
recommended_action: z.string().max(2e3).optional()
|
|
1128
|
-
}).strict();
|
|
1129
|
-
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
1130
|
-
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
1131
|
-
- claim: one-sentence statement (max 2000 chars)
|
|
1132
|
-
- subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
|
|
1133
|
-
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
|
|
1134
|
-
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
|
|
1135
|
-
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
|
|
1136
|
-
- rationale?: one or two sentences explaining the reasoning
|
|
1137
|
-
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
|
|
1138
|
-
|
|
1139
|
-
Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
|
|
1140
|
-
function parseRawFinding(row, log) {
|
|
1141
|
-
const result = RawAnalystFindingSchema.safeParse(row);
|
|
1142
|
-
if (!result.success) {
|
|
1143
|
-
log?.("finding rejected: schema failure", {
|
|
1144
|
-
issues: result.error.issues.map((i) => ({ path: i.path.join("."), code: i.code, message: i.message }))
|
|
1145
|
-
});
|
|
1146
|
-
return null;
|
|
1147
|
-
}
|
|
1148
|
-
return result.data;
|
|
1149
|
-
}
|
|
1150
|
-
|
|
1151
1155
|
// src/analyst/kind-factory.ts
|
|
1152
|
-
import {
|
|
1156
|
+
import { AxJSRuntime, agent } from "@ax-llm/ax";
|
|
1153
1157
|
function createTraceAnalystKind(spec, opts) {
|
|
1154
1158
|
const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
|
|
1155
1159
|
return {
|
|
@@ -1162,7 +1166,8 @@ function createTraceAnalystKind(spec, opts) {
|
|
|
1162
1166
|
const tools = spec.buildTools(store);
|
|
1163
1167
|
const maxDepth = spec.recursion?.maxDepth ?? 0;
|
|
1164
1168
|
const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
|
|
1165
|
-
const
|
|
1169
|
+
const priorContext = renderPriorFindings(ctx.priorFindings);
|
|
1170
|
+
const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
|
|
1166
1171
|
const ax = agent(
|
|
1167
1172
|
"question:string -> findings:json[]",
|
|
1168
1173
|
{
|
|
@@ -1257,12 +1262,41 @@ function evidenceKindFromUri(uri) {
|
|
|
1257
1262
|
if (uri.startsWith("finding://")) return "finding";
|
|
1258
1263
|
return "artifact";
|
|
1259
1264
|
}
|
|
1265
|
+
/**
 * Render prior-run findings as a prompt section that can be appended directly
 * to an analyst's actor description.
 *
 * The returned text starts with a blank-line separator ("\n\n") so that
 * appending it to a trimmed description does not glue the "PRIOR FINDINGS"
 * header onto the description's last sentence. (Previously the separator
 * entries were empty strings that a `.filter(Boolean)` silently removed.)
 *
 * @param {Array<{finding_id: string, severity: string, subject?: string, claim: string}>|null|undefined} prior
 *   Findings from an earlier run; at most the first 40 are listed.
 * @returns {string} "" when there is nothing to render, otherwise the section
 *   text prefixed with "\n\n".
 */
function renderPriorFindings(prior) {
  if (!prior || prior.length === 0) return "";
  const MAX_ROWS = 40;
  const rows = prior.slice(0, MAX_ROWS).map((f) => {
    const subject = f.subject ? ` [${f.subject}]` : "";
    return ` - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
  });
  const lines = [
    "PRIOR FINDINGS (from a previous run on related data):",
    // NOTE(review): the raw finding schema visible in this bundle is strict and
    // declares no `id_basis` field; confirm the field is consumed before
    // schema validation, otherwise findings that follow this hint are rejected.
    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
    ...rows
  ];
  if (prior.length > MAX_ROWS) {
    // Keep one blank line between the listed rows and the overflow notice.
    lines.push("", `... +${prior.length - MAX_ROWS} more prior findings (older history truncated)`);
  }
  return `\n\n${lines.join("\n")}`;
}

/**
 * Shorten `s` to at most `max` UTF-16 code units, ending with an ellipsis
 * when anything was cut.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum length including the trailing ellipsis.
 * @returns {string} `s` unchanged when it already fits, otherwise a trimmed
 *   prefix followed by "\u2026".
 */
function truncateForContext(s, max) {
  if (s.length <= max) return s;
  // Clamp so a non-positive budget never turns into a negative slice index
  // (which would keep almost the whole string instead of cutting it).
  let head = s.slice(0, Math.max(0, max - 1));
  // Do not leave a dangling high surrogate when the cut lands inside a pair.
  const last = head.charCodeAt(head.length - 1);
  if (last >= 0xd800 && last <= 0xdbff) head = head.slice(0, -1);
  return `${head.trimEnd()}\u2026`;
}
|
|
1260
1288
|
|
|
1261
1289
|
// src/analyst/tool-groups.ts
|
|
1262
1290
|
var TOOL_NAMES_BY_GROUP = {
|
|
1263
1291
|
all: /* @__PURE__ */ new Set(),
|
|
1264
1292
|
discovery: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces"]),
|
|
1265
|
-
discoveryAndRead: /* @__PURE__ */ new Set([
|
|
1293
|
+
discoveryAndRead: /* @__PURE__ */ new Set([
|
|
1294
|
+
"getDatasetOverview",
|
|
1295
|
+
"queryTraces",
|
|
1296
|
+
"countTraces",
|
|
1297
|
+
"viewTrace",
|
|
1298
|
+
"viewSpans"
|
|
1299
|
+
]),
|
|
1266
1300
|
discoveryAndSearch: /* @__PURE__ */ new Set([
|
|
1267
1301
|
"getDatasetOverview",
|
|
1268
1302
|
"queryTraces",
|
|
@@ -1323,8 +1357,59 @@ var FAILURE_MODE_KIND_SPEC = {
|
|
|
1323
1357
|
cost: { kind: "llm" }
|
|
1324
1358
|
};
|
|
1325
1359
|
|
|
1360
|
+
// src/analyst/kinds/improvement.ts
|
|
1361
|
+
var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
|
|
1362
|
+
|
|
1363
|
+
Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
|
|
1364
|
+
|
|
1365
|
+
DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
|
|
1366
|
+
|
|
1367
|
+
1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
|
|
1368
|
+
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
|
|
1369
|
+
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
|
|
1370
|
+
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
|
|
1371
|
+
- **New tool** \u2014 add a tool the agent kept emulating in code
|
|
1372
|
+
- **RAG ingestion** \u2014 add a document or correct a stale one
|
|
1373
|
+
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
|
|
1374
|
+
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
|
|
1375
|
+
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
|
|
1376
|
+
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
|
|
1377
|
+
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
|
|
1378
|
+
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
|
|
1379
|
+
|
|
1380
|
+
For each winning recommendation, emit ONE finding with:
|
|
1381
|
+
- \`area\` = "improvement"
|
|
1382
|
+
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
|
|
1383
|
+
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
|
|
1384
|
+
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
|
|
1385
|
+
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
|
|
1386
|
+
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
|
|
1387
|
+
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
|
|
1388
|
+
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
|
|
1389
|
+
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
|
|
1390
|
+
|
|
1391
|
+
If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
|
|
1392
|
+
|
|
1393
|
+
Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
|
|
1394
|
+
|
|
1395
|
+
OBSERVABILITY rules:
|
|
1396
|
+
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1397
|
+
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1398
|
+
var IMPROVEMENT_KIND_SPEC = {
|
|
1399
|
+
id: "improvement",
|
|
1400
|
+
description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
|
|
1401
|
+
area: "improvement",
|
|
1402
|
+
version: "1.0.0",
|
|
1403
|
+
actorDescription: ACTOR_PROMPT2,
|
|
1404
|
+
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1405
|
+
recursion: { maxDepth: 3, maxParallelSubagents: 4 },
|
|
1406
|
+
maxTurns: 30,
|
|
1407
|
+
maxRuntimeChars: 12e3,
|
|
1408
|
+
cost: { kind: "llm" }
|
|
1409
|
+
};
|
|
1410
|
+
|
|
1326
1411
|
// src/analyst/kinds/knowledge-gap.ts
|
|
1327
|
-
var
|
|
1412
|
+
var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
|
|
1328
1413
|
|
|
1329
1414
|
The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
|
|
1330
1415
|
|
|
@@ -1372,7 +1457,7 @@ var KNOWLEDGE_GAP_KIND_SPEC = {
|
|
|
1372
1457
|
description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
|
|
1373
1458
|
area: "knowledge-gap",
|
|
1374
1459
|
version: "1.0.0",
|
|
1375
|
-
actorDescription:
|
|
1460
|
+
actorDescription: ACTOR_PROMPT3,
|
|
1376
1461
|
buildTools: (store) => buildTraceToolsForGroup("discoveryAndSearch", store),
|
|
1377
1462
|
recursion: { maxDepth: 2, maxParallelSubagents: 4 },
|
|
1378
1463
|
maxTurns: 18,
|
|
@@ -1380,7 +1465,7 @@ var KNOWLEDGE_GAP_KIND_SPEC = {
|
|
|
1380
1465
|
};
|
|
1381
1466
|
|
|
1382
1467
|
// src/analyst/kinds/knowledge-poisoning.ts
|
|
1383
|
-
var
|
|
1468
|
+
var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
|
|
1384
1469
|
|
|
1385
1470
|
DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
|
|
1386
1471
|
|
|
@@ -1420,64 +1505,13 @@ var KNOWLEDGE_POISONING_KIND_SPEC = {
|
|
|
1420
1505
|
description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
|
|
1421
1506
|
area: "knowledge-poisoning",
|
|
1422
1507
|
version: "1.0.0",
|
|
1423
|
-
actorDescription:
|
|
1508
|
+
actorDescription: ACTOR_PROMPT4,
|
|
1424
1509
|
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1425
1510
|
recursion: { maxDepth: 2, maxParallelSubagents: 4 },
|
|
1426
1511
|
maxTurns: 20,
|
|
1427
1512
|
cost: { kind: "llm" }
|
|
1428
1513
|
};
|
|
1429
1514
|
|
|
1430
|
-
// src/analyst/kinds/improvement.ts
|
|
1431
|
-
var ACTOR_PROMPT4 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
|
|
1432
|
-
|
|
1433
|
-
Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
|
|
1434
|
-
|
|
1435
|
-
DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
|
|
1436
|
-
|
|
1437
|
-
1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
|
|
1438
|
-
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
|
|
1439
|
-
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
|
|
1440
|
-
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
|
|
1441
|
-
- **New tool** \u2014 add a tool the agent kept emulating in code
|
|
1442
|
-
- **RAG ingestion** \u2014 add a document or correct a stale one
|
|
1443
|
-
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
|
|
1444
|
-
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
|
|
1445
|
-
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
|
|
1446
|
-
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
|
|
1447
|
-
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
|
|
1448
|
-
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
|
|
1449
|
-
|
|
1450
|
-
For each winning recommendation, emit ONE finding with:
|
|
1451
|
-
- \`area\` = "improvement"
|
|
1452
|
-
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
|
|
1453
|
-
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
|
|
1454
|
-
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
|
|
1455
|
-
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
|
|
1456
|
-
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
|
|
1457
|
-
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
|
|
1458
|
-
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
|
|
1459
|
-
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
|
|
1460
|
-
|
|
1461
|
-
If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
|
|
1462
|
-
|
|
1463
|
-
Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
|
|
1464
|
-
|
|
1465
|
-
OBSERVABILITY rules:
|
|
1466
|
-
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1467
|
-
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1468
|
-
var IMPROVEMENT_KIND_SPEC = {
|
|
1469
|
-
id: "improvement",
|
|
1470
|
-
description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
|
|
1471
|
-
area: "improvement",
|
|
1472
|
-
version: "1.0.0",
|
|
1473
|
-
actorDescription: ACTOR_PROMPT4,
|
|
1474
|
-
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1475
|
-
recursion: { maxDepth: 3, maxParallelSubagents: 4 },
|
|
1476
|
-
maxTurns: 30,
|
|
1477
|
-
maxRuntimeChars: 12e3,
|
|
1478
|
-
cost: { kind: "llm" }
|
|
1479
|
-
};
|
|
1480
|
-
|
|
1481
1515
|
// src/analyst/kinds/index.ts
|
|
1482
1516
|
var DEFAULT_TRACE_ANALYST_KINDS = [
|
|
1483
1517
|
FAILURE_MODE_KIND_SPEC,
|
|
@@ -1556,7 +1590,8 @@ var AnalystRegistry = class {
|
|
|
1556
1590
|
chat: this.options.chat,
|
|
1557
1591
|
tags: runOpts.tags,
|
|
1558
1592
|
log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
|
|
1559
|
-
signal: runOpts.signal
|
|
1593
|
+
signal: runOpts.signal,
|
|
1594
|
+
priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
|
|
1560
1595
|
};
|
|
1561
1596
|
await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
|
|
1562
1597
|
try {
|
|
@@ -1670,6 +1705,18 @@ function sumFindingCost(findings) {
|
|
|
1670
1705
|
}
|
|
1671
1706
|
return sum2;
|
|
1672
1707
|
}
|
|
1708
|
+
/**
 * Pick the prior findings that should be shown to one analyst.
 *
 * `source` may be:
 *  - a flat array of findings, filtered by `analyst_id === analystId`; or
 *  - a record keyed by analyst id, where the analyst's own entry is merged
 *    with the entry stored under the wildcard key "*" (own entries first).
 *
 * @param {Array<object>|Record<string, Array<object>>|null|undefined} source
 * @param {string} analystId - Id of the analyst about to run.
 * @returns {Array<object>|undefined} The selected findings, or `undefined`
 *   when there are none.
 */
function selectPriorFindings(source, analystId) {
  if (!source) return void 0;
  if (Array.isArray(source)) {
    const own2 = source.filter((f) => f.analyst_id === analystId);
    return own2.length > 0 ? own2 : void 0;
  }
  const record = source;
  // Only own keys count: an analyst id such as "toString" or "constructor"
  // must not resolve to an inherited Object.prototype member, which would be
  // a function and make the spread below throw.
  const entry = (key) => Object.prototype.hasOwnProperty.call(record, key) ? record[key] ?? [] : [];
  const merged = [...entry(analystId), ...entry("*")];
  return merged.length > 0 ? merged : void 0;
}
|
|
1673
1720
|
|
|
1674
1721
|
// src/auto-pr.ts
|
|
1675
1722
|
async function proposeAutomatedPullRequest(client, input) {
|
|
@@ -10320,6 +10367,7 @@ export {
|
|
|
10320
10367
|
renderMarkdownReport,
|
|
10321
10368
|
renderPlaybookMarkdown,
|
|
10322
10369
|
renderPreferenceMemoryMarkdown,
|
|
10370
|
+
renderPriorFindings,
|
|
10323
10371
|
renderReleaseReport,
|
|
10324
10372
|
renderSteeringText,
|
|
10325
10373
|
replayFeedbackTrajectories,
|