@tangle-network/agent-eval 0.28.0 → 0.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +87 -0
- package/dist/index.d.ts +187 -105
- package/dist/index.js +175 -95
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -967,6 +967,45 @@ function resolveModel(req, defaultModel) {
|
|
|
967
967
|
return { ...req, model: defaultModel };
|
|
968
968
|
}
|
|
969
969
|
|
|
970
|
+
// src/analyst/finding-signature.ts
|
|
971
|
+
import { z } from "zod";
|
|
972
|
+
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
|
|
973
|
+
var RawAnalystFindingSchema = z.object({
|
|
974
|
+
severity: z.enum(ANALYST_SEVERITIES),
|
|
975
|
+
claim: z.string().min(1).max(2e3),
|
|
976
|
+
subject: z.string().max(400).optional(),
|
|
977
|
+
evidence_uri: z.string().min(1).max(2e3),
|
|
978
|
+
evidence_excerpt: z.string().max(2e3).optional(),
|
|
979
|
+
confidence: z.number().min(0).max(1),
|
|
980
|
+
rationale: z.string().max(4e3).optional(),
|
|
981
|
+
recommended_action: z.string().max(2e3).optional()
|
|
982
|
+
}).strict();
|
|
983
|
+
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
984
|
+
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
985
|
+
- claim: one-sentence statement (max 2000 chars)
|
|
986
|
+
- subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
|
|
987
|
+
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
|
|
988
|
+
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
|
|
989
|
+
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
|
|
990
|
+
- rationale?: one or two sentences explaining the reasoning
|
|
991
|
+
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
|
|
992
|
+
|
|
993
|
+
Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
|
|
994
|
+
/**
 * Validate one raw finding row against `RawAnalystFindingSchema`.
 *
 * @param {unknown} row - Candidate finding object.
 * @param {(msg: string, fields?: object) => void} [log] - Optional logger; called
 *   once with the flattened schema issues when validation fails.
 * @returns {object | null} The parsed finding data, or `null` when the row is rejected.
 */
function parseRawFinding(row, log) {
  const parsed = RawAnalystFindingSchema.safeParse(row);
  if (parsed.success) {
    return parsed.data;
  }
  // The issue list is built inside the optional call so it is only computed
  // when a logger was actually supplied.
  log?.("finding rejected: schema failure", {
    issues: parsed.error.issues.map(({ path, code, message }) => ({
      path: path.join("."),
      code,
      message
    }))
  });
  return null;
}
|
|
1008
|
+
|
|
970
1009
|
// src/analyst/findings-store.ts
|
|
971
1010
|
import { existsSync as existsSync2, readFileSync } from "fs";
|
|
972
1011
|
|
|
@@ -1113,43 +1152,8 @@ function diffFindings(previous, current, policy = {}) {
|
|
|
1113
1152
|
return { appeared, disappeared, persisted, changed };
|
|
1114
1153
|
}
|
|
1115
1154
|
|
|
1116
|
-
// src/analyst/finding-signature.ts
|
|
1117
|
-
import { z } from "zod";
|
|
1118
|
-
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
|
|
1119
|
-
var RawAnalystFindingSchema = z.object({
|
|
1120
|
-
severity: z.enum(ANALYST_SEVERITIES),
|
|
1121
|
-
claim: z.string().min(1).max(2e3),
|
|
1122
|
-
subject: z.string().max(400).optional(),
|
|
1123
|
-
evidence_uri: z.string().min(1).max(2e3),
|
|
1124
|
-
evidence_excerpt: z.string().max(2e3).optional(),
|
|
1125
|
-
confidence: z.number().min(0).max(1),
|
|
1126
|
-
rationale: z.string().max(4e3).optional(),
|
|
1127
|
-
recommended_action: z.string().max(2e3).optional()
|
|
1128
|
-
}).strict();
|
|
1129
|
-
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
1130
|
-
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
1131
|
-
- claim: one-sentence statement (max 2000 chars)
|
|
1132
|
-
- subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
|
|
1133
|
-
- evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
|
|
1134
|
-
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
|
|
1135
|
-
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
|
|
1136
|
-
- rationale?: one or two sentences explaining the reasoning
|
|
1137
|
-
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
|
|
1138
|
-
|
|
1139
|
-
Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
|
|
1140
|
-
function parseRawFinding(row, log) {
|
|
1141
|
-
const result = RawAnalystFindingSchema.safeParse(row);
|
|
1142
|
-
if (!result.success) {
|
|
1143
|
-
log?.("finding rejected: schema failure", {
|
|
1144
|
-
issues: result.error.issues.map((i) => ({ path: i.path.join("."), code: i.code, message: i.message }))
|
|
1145
|
-
});
|
|
1146
|
-
return null;
|
|
1147
|
-
}
|
|
1148
|
-
return result.data;
|
|
1149
|
-
}
|
|
1150
|
-
|
|
1151
1155
|
// src/analyst/kind-factory.ts
|
|
1152
|
-
import {
|
|
1156
|
+
import { AxJSRuntime, agent } from "@ax-llm/ax";
|
|
1153
1157
|
function createTraceAnalystKind(spec, opts) {
|
|
1154
1158
|
const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
|
|
1155
1159
|
return {
|
|
@@ -1162,7 +1166,8 @@ function createTraceAnalystKind(spec, opts) {
|
|
|
1162
1166
|
const tools = spec.buildTools(store);
|
|
1163
1167
|
const maxDepth = spec.recursion?.maxDepth ?? 0;
|
|
1164
1168
|
const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
|
|
1165
|
-
const
|
|
1169
|
+
const priorContext = renderPriorFindings(ctx.priorFindings);
|
|
1170
|
+
const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
|
|
1166
1171
|
const ax = agent(
|
|
1167
1172
|
"question:string -> findings:json[]",
|
|
1168
1173
|
{
|
|
@@ -1257,12 +1262,41 @@ function evidenceKindFromUri(uri) {
|
|
|
1257
1262
|
if (uri.startsWith("finding://")) return "finding";
|
|
1258
1263
|
return "artifact";
|
|
1259
1264
|
}
|
|
1265
|
+
/**
 * Render findings from a previous run as a prompt section that can be appended
 * directly to an actor description.
 *
 * The returned text begins with a blank-line separator ("\n\n") because the
 * call site concatenates it straight after the trimmed actor description.
 * Previously the separator entries were removed by `.filter(Boolean)`, which
 * glued the "PRIOR FINDINGS" header onto the last character of the prompt.
 *
 * @param {Array<{finding_id: string, severity: string, subject?: string, claim: string}> | null | undefined} prior
 *   Prior findings; only the first 40 are listed.
 * @returns {string} The rendered section, or "" when there are no prior findings.
 */
function renderPriorFindings(prior) {
  if (!prior || prior.length === 0) return "";
  const MAX_ROWS = 40;
  const rows = prior.slice(0, MAX_ROWS).map((f) => {
    const subject = f.subject ? ` [${f.subject}]` : "";
    return ` - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
  });
  const lines = [
    // Two empty entries join to "\n\n": the separator from the preceding prompt text.
    "",
    "",
    "PRIOR FINDINGS (from a previous run on related data):",
    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
    ...rows
  ];
  const hidden = prior.length - MAX_ROWS;
  if (hidden > 0) {
    // Blank line, then the note — only present when rows were actually dropped.
    lines.push("", `... +${hidden} more prior findings (older history truncated)`);
  }
  return lines.join("\n");
}
|
|
1284
|
+
/**
 * Shorten `s` to at most `max` UTF-16 code units, ending with an ellipsis
 * (U+2026) when anything was cut.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum length of the result in UTF-16 code units.
 * @returns {string} `s` unchanged when it already fits; otherwise a trimmed
 *   prefix followed by "…". Returns "" when `max` leaves no room for the ellipsis.
 */
function truncateForContext(s, max) {
  if (s.length <= max) return s;
  // With max < 1 the old `slice(0, max - 1)` used a negative end index and
  // returned almost the whole string; there is no room for any output here.
  if (max < 1) return "";
  let end = max - 1;
  if (end > 0) {
    // Do not cut between the two halves of a surrogate pair: a dangling high
    // surrogate is not valid text once the prompt is serialized.
    const last = s.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return `${s.slice(0, end).trimEnd()}\u2026`;
}
|
|
1260
1288
|
|
|
1261
1289
|
// src/analyst/tool-groups.ts
|
|
1262
1290
|
var TOOL_NAMES_BY_GROUP = {
|
|
1263
1291
|
all: /* @__PURE__ */ new Set(),
|
|
1264
1292
|
discovery: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces"]),
|
|
1265
|
-
discoveryAndRead: /* @__PURE__ */ new Set([
|
|
1293
|
+
discoveryAndRead: /* @__PURE__ */ new Set([
|
|
1294
|
+
"getDatasetOverview",
|
|
1295
|
+
"queryTraces",
|
|
1296
|
+
"countTraces",
|
|
1297
|
+
"viewTrace",
|
|
1298
|
+
"viewSpans"
|
|
1299
|
+
]),
|
|
1266
1300
|
discoveryAndSearch: /* @__PURE__ */ new Set([
|
|
1267
1301
|
"getDatasetOverview",
|
|
1268
1302
|
"queryTraces",
|
|
@@ -1323,8 +1357,59 @@ var FAILURE_MODE_KIND_SPEC = {
|
|
|
1323
1357
|
cost: { kind: "llm" }
|
|
1324
1358
|
};
|
|
1325
1359
|
|
|
1360
|
+
// src/analyst/kinds/improvement.ts
|
|
1361
|
+
var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
|
|
1362
|
+
|
|
1363
|
+
Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
|
|
1364
|
+
|
|
1365
|
+
DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
|
|
1366
|
+
|
|
1367
|
+
1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
|
|
1368
|
+
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
|
|
1369
|
+
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
|
|
1370
|
+
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
|
|
1371
|
+
- **New tool** \u2014 add a tool the agent kept emulating in code
|
|
1372
|
+
- **RAG ingestion** \u2014 add a document or correct a stale one
|
|
1373
|
+
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
|
|
1374
|
+
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
|
|
1375
|
+
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
|
|
1376
|
+
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
|
|
1377
|
+
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
|
|
1378
|
+
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
|
|
1379
|
+
|
|
1380
|
+
For each winning recommendation, emit ONE finding with:
|
|
1381
|
+
- \`area\` = "improvement"
|
|
1382
|
+
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
|
|
1383
|
+
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
|
|
1384
|
+
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
|
|
1385
|
+
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
|
|
1386
|
+
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
|
|
1387
|
+
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
|
|
1388
|
+
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
|
|
1389
|
+
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
|
|
1390
|
+
|
|
1391
|
+
If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
|
|
1392
|
+
|
|
1393
|
+
Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
|
|
1394
|
+
|
|
1395
|
+
OBSERVABILITY rules:
|
|
1396
|
+
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1397
|
+
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1398
|
+
var IMPROVEMENT_KIND_SPEC = {
|
|
1399
|
+
id: "improvement",
|
|
1400
|
+
description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
|
|
1401
|
+
area: "improvement",
|
|
1402
|
+
version: "1.0.0",
|
|
1403
|
+
actorDescription: ACTOR_PROMPT2,
|
|
1404
|
+
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1405
|
+
recursion: { maxDepth: 3, maxParallelSubagents: 4 },
|
|
1406
|
+
maxTurns: 30,
|
|
1407
|
+
maxRuntimeChars: 12e3,
|
|
1408
|
+
cost: { kind: "llm" }
|
|
1409
|
+
};
|
|
1410
|
+
|
|
1326
1411
|
// src/analyst/kinds/knowledge-gap.ts
|
|
1327
|
-
var
|
|
1412
|
+
var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
|
|
1328
1413
|
|
|
1329
1414
|
The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
|
|
1330
1415
|
|
|
@@ -1372,7 +1457,7 @@ var KNOWLEDGE_GAP_KIND_SPEC = {
|
|
|
1372
1457
|
description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
|
|
1373
1458
|
area: "knowledge-gap",
|
|
1374
1459
|
version: "1.0.0",
|
|
1375
|
-
actorDescription:
|
|
1460
|
+
actorDescription: ACTOR_PROMPT3,
|
|
1376
1461
|
buildTools: (store) => buildTraceToolsForGroup("discoveryAndSearch", store),
|
|
1377
1462
|
recursion: { maxDepth: 2, maxParallelSubagents: 4 },
|
|
1378
1463
|
maxTurns: 18,
|
|
@@ -1380,7 +1465,7 @@ var KNOWLEDGE_GAP_KIND_SPEC = {
|
|
|
1380
1465
|
};
|
|
1381
1466
|
|
|
1382
1467
|
// src/analyst/kinds/knowledge-poisoning.ts
|
|
1383
|
-
var
|
|
1468
|
+
var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
|
|
1384
1469
|
|
|
1385
1470
|
DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
|
|
1386
1471
|
|
|
@@ -1420,64 +1505,13 @@ var KNOWLEDGE_POISONING_KIND_SPEC = {
|
|
|
1420
1505
|
description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
|
|
1421
1506
|
area: "knowledge-poisoning",
|
|
1422
1507
|
version: "1.0.0",
|
|
1423
|
-
actorDescription:
|
|
1508
|
+
actorDescription: ACTOR_PROMPT4,
|
|
1424
1509
|
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1425
1510
|
recursion: { maxDepth: 2, maxParallelSubagents: 4 },
|
|
1426
1511
|
maxTurns: 20,
|
|
1427
1512
|
cost: { kind: "llm" }
|
|
1428
1513
|
};
|
|
1429
1514
|
|
|
1430
|
-
// src/analyst/kinds/improvement.ts
|
|
1431
|
-
var ACTOR_PROMPT4 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
|
|
1432
|
-
|
|
1433
|
-
Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
|
|
1434
|
-
|
|
1435
|
-
DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
|
|
1436
|
-
|
|
1437
|
-
1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
|
|
1438
|
-
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
|
|
1439
|
-
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
|
|
1440
|
-
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
|
|
1441
|
-
- **New tool** \u2014 add a tool the agent kept emulating in code
|
|
1442
|
-
- **RAG ingestion** \u2014 add a document or correct a stale one
|
|
1443
|
-
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
|
|
1444
|
-
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
|
|
1445
|
-
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
|
|
1446
|
-
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
|
|
1447
|
-
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
|
|
1448
|
-
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
|
|
1449
|
-
|
|
1450
|
-
For each winning recommendation, emit ONE finding with:
|
|
1451
|
-
- \`area\` = "improvement"
|
|
1452
|
-
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
|
|
1453
|
-
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
|
|
1454
|
-
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
|
|
1455
|
-
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
|
|
1456
|
-
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
|
|
1457
|
-
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
|
|
1458
|
-
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
|
|
1459
|
-
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
|
|
1460
|
-
|
|
1461
|
-
If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
|
|
1462
|
-
|
|
1463
|
-
Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
|
|
1464
|
-
|
|
1465
|
-
OBSERVABILITY rules:
|
|
1466
|
-
- Each non-final turn must emit at least one \`console.log\` for evidence.
|
|
1467
|
-
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
|
|
1468
|
-
var IMPROVEMENT_KIND_SPEC = {
|
|
1469
|
-
id: "improvement",
|
|
1470
|
-
description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
|
|
1471
|
-
area: "improvement",
|
|
1472
|
-
version: "1.0.0",
|
|
1473
|
-
actorDescription: ACTOR_PROMPT4,
|
|
1474
|
-
buildTools: (store) => buildTraceToolsForGroup("all", store),
|
|
1475
|
-
recursion: { maxDepth: 3, maxParallelSubagents: 4 },
|
|
1476
|
-
maxTurns: 30,
|
|
1477
|
-
maxRuntimeChars: 12e3,
|
|
1478
|
-
cost: { kind: "llm" }
|
|
1479
|
-
};
|
|
1480
|
-
|
|
1481
1515
|
// src/analyst/kinds/index.ts
|
|
1482
1516
|
var DEFAULT_TRACE_ANALYST_KINDS = [
|
|
1483
1517
|
FAILURE_MODE_KIND_SPEC,
|
|
@@ -1513,6 +1547,23 @@ var AnalystRegistry = class {
|
|
|
1513
1547
|
}));
|
|
1514
1548
|
}
|
|
1515
1549
|
async run(runId, inputs, runOpts = {}) {
|
|
1550
|
+
for await (const ev of this.runStream(runId, inputs, runOpts)) {
|
|
1551
|
+
if (ev.type === "run-completed") return ev.result;
|
|
1552
|
+
}
|
|
1553
|
+
throw new Error("AnalystRegistry.run: stream completed without run-completed event");
|
|
1554
|
+
}
|
|
1555
|
+
/**
|
|
1556
|
+
* Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
|
|
1557
|
+
* in real time — `run-started`, then per-analyst `skipped` /
|
|
1558
|
+
* `started` / `completed`, then a terminal `run-completed` whose
|
|
1559
|
+
* payload is the full `AnalystRunResult`. UIs use this to render
|
|
1560
|
+
* progress; persistence consumers use `run()` and read the result.
|
|
1561
|
+
*
|
|
1562
|
+
* Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
|
|
1563
|
+
* `onComplete`) fire as before — streaming is additive, not a hook
|
|
1564
|
+
* replacement.
|
|
1565
|
+
*/
|
|
1566
|
+
async *runStream(runId, inputs, runOpts = {}) {
|
|
1516
1567
|
const correlationId = `ar_${randomUUID().slice(0, 12)}`;
|
|
1517
1568
|
const log = this.options.log ?? (() => {
|
|
1518
1569
|
});
|
|
@@ -1522,6 +1573,13 @@ var AnalystRegistry = class {
|
|
|
1522
1573
|
const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
|
|
1523
1574
|
const selected = this.selectAnalysts(runOpts);
|
|
1524
1575
|
const budget = runOpts.budget ?? this.options.defaultBudget;
|
|
1576
|
+
yield {
|
|
1577
|
+
type: "run-started",
|
|
1578
|
+
run_id: runId,
|
|
1579
|
+
correlation_id: correlationId,
|
|
1580
|
+
started_at: startedAt,
|
|
1581
|
+
analyst_ids: selected.map((a) => a.id)
|
|
1582
|
+
};
|
|
1525
1583
|
const summaries = [];
|
|
1526
1584
|
const allFindings = [];
|
|
1527
1585
|
let totalCost = 0;
|
|
@@ -1541,6 +1599,7 @@ var AnalystRegistry = class {
|
|
|
1541
1599
|
summaries.push(summary);
|
|
1542
1600
|
log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
|
|
1543
1601
|
await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
|
|
1602
|
+
yield { type: "analyst-skipped", summary };
|
|
1544
1603
|
continue;
|
|
1545
1604
|
}
|
|
1546
1605
|
const perBudget = allocateBudget(budget, {
|
|
@@ -1556,9 +1615,15 @@ var AnalystRegistry = class {
|
|
|
1556
1615
|
chat: this.options.chat,
|
|
1557
1616
|
tags: runOpts.tags,
|
|
1558
1617
|
log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
|
|
1559
|
-
signal: runOpts.signal
|
|
1618
|
+
signal: runOpts.signal,
|
|
1619
|
+
priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
|
|
1560
1620
|
};
|
|
1561
1621
|
await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
|
|
1622
|
+
yield {
|
|
1623
|
+
type: "analyst-started",
|
|
1624
|
+
analyst_id: analyst.id,
|
|
1625
|
+
started_at: new Date(t0).toISOString()
|
|
1626
|
+
};
|
|
1562
1627
|
try {
|
|
1563
1628
|
const findings = await analyst.analyze(input.value, ctx);
|
|
1564
1629
|
const latency = Date.now() - t0;
|
|
@@ -1581,6 +1646,7 @@ var AnalystRegistry = class {
|
|
|
1581
1646
|
cost_usd: cost
|
|
1582
1647
|
});
|
|
1583
1648
|
await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
|
|
1649
|
+
yield { type: "analyst-completed", summary, findings };
|
|
1584
1650
|
} catch (err) {
|
|
1585
1651
|
const latency = Date.now() - t0;
|
|
1586
1652
|
const e = err instanceof Error ? err : new Error(String(err));
|
|
@@ -1601,6 +1667,7 @@ var AnalystRegistry = class {
|
|
|
1601
1667
|
error: e.message
|
|
1602
1668
|
});
|
|
1603
1669
|
await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
|
|
1670
|
+
yield { type: "analyst-completed", summary, findings: hookFindings };
|
|
1604
1671
|
}
|
|
1605
1672
|
}
|
|
1606
1673
|
const result = {
|
|
@@ -1613,7 +1680,7 @@ var AnalystRegistry = class {
|
|
|
1613
1680
|
total_cost_usd: totalCost
|
|
1614
1681
|
};
|
|
1615
1682
|
await hooks.onComplete?.({ result });
|
|
1616
|
-
|
|
1683
|
+
yield { type: "run-completed", result };
|
|
1617
1684
|
}
|
|
1618
1685
|
selectAnalysts(opts) {
|
|
1619
1686
|
let candidates = Array.from(this.analysts.values());
|
|
@@ -1670,6 +1737,18 @@ function sumFindingCost(findings) {
|
|
|
1670
1737
|
}
|
|
1671
1738
|
return sum2;
|
|
1672
1739
|
}
|
|
1740
|
+
/**
 * Pick the prior findings that should be shown to one analyst.
 *
 * `source` may be either:
 *  - an array of findings, filtered by `analyst_id === analystId`; or
 *  - a record keyed by analyst id, where the `"*"` key applies to every analyst
 *    and is appended after the analyst's own entries.
 *
 * Record keys are read as own properties only, so an analyst id that collides
 * with an inherited member (e.g. "constructor") no longer resolves to a
 * non-iterable value and throws during the spread. Non-array values are
 * treated as "no findings".
 *
 * @param {Array<object> | Record<string, Array<object>> | null | undefined} source
 * @param {string} analystId
 * @returns {Array<object> | undefined} The matching findings, or `undefined` when there are none.
 */
function selectPriorFindings(source, analystId) {
  if (!source) return void 0;
  if (Array.isArray(source)) {
    const own = source.filter((f) => f.analyst_id === analystId);
    return own.length > 0 ? own : void 0;
  }
  const listFor = (key) => {
    const value = Object.prototype.hasOwnProperty.call(source, key) ? source[key] : void 0;
    return Array.isArray(value) ? value : [];
  };
  const merged = [...listFor(analystId), ...listFor("*")];
  return merged.length > 0 ? merged : void 0;
}
|
|
1673
1752
|
|
|
1674
1753
|
// src/auto-pr.ts
|
|
1675
1754
|
async function proposeAutomatedPullRequest(client, input) {
|
|
@@ -10320,6 +10399,7 @@ export {
|
|
|
10320
10399
|
renderMarkdownReport,
|
|
10321
10400
|
renderPlaybookMarkdown,
|
|
10322
10401
|
renderPreferenceMemoryMarkdown,
|
|
10402
|
+
renderPriorFindings,
|
|
10323
10403
|
renderReleaseReport,
|
|
10324
10404
|
renderSteeringText,
|
|
10325
10405
|
replayFeedbackTrajectories,
|