@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/CHANGELOG.md +39 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +518 -9
  11. package/dist/campaign/index.js +672 -22
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
  19. package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/contract/index.d.ts +17 -13
  29. package/dist/contract/index.js +13 -7
  30. package/dist/contract/index.js.map +1 -1
  31. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  32. package/dist/control.d.ts +2 -2
  33. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  34. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  35. package/dist/hosted/index.d.ts +223 -2
  36. package/dist/index.d.ts +49 -1323
  37. package/dist/index.js +353 -2496
  38. package/dist/index.js.map +1 -1
  39. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  40. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  41. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  42. package/dist/openapi.json +1 -1
  43. package/dist/pareto-E-pembql.d.ts +81 -0
  44. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  45. package/dist/redact-B40YG2M_.d.ts +45 -0
  46. package/dist/registry-DuVYiTvw.d.ts +128 -0
  47. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  48. package/dist/rl.d.ts +4 -3
  49. package/dist/rl.js +4 -4
  50. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  51. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  52. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  53. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  54. package/dist/traces.d.ts +371 -308
  55. package/dist/traces.js +43 -18
  56. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  57. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  58. package/dist/wire/index.d.ts +1 -1
  59. package/dist/workflow/index.d.ts +494 -0
  60. package/dist/workflow/index.js +2177 -0
  61. package/dist/workflow/index.js.map +1 -0
  62. package/docs/design/self-improvement-roadmap.md +106 -0
  63. package/package.json +36 -12
  64. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  65. package/dist/chunk-ODGETRTM.js.map +0 -1
  66. package/dist/chunk-SL55X4VN.js +0 -186
  67. package/dist/chunk-SL55X4VN.js.map +0 -1
  68. package/dist/chunk-UD6EF73X.js.map +0 -1
  69. package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
@@ -0,0 +1,1044 @@
1
+ import {
2
+ callLlm
3
+ } from "./chunk-IHDHUN2X.js";
4
+ import {
5
+ TraceFileMissingError,
6
+ buildTraceAnalystTools
7
+ } from "./chunk-VUINJM5M.js";
8
+
9
+ // src/analyst/types.ts
10
+ import { createHash } from "crypto";
11
/**
 * Derive the deterministic identifier of a finding.
 *
 * The id is `f_` followed by the first 20 hex characters of the SHA-256 digest
 * of a JSON document holding the analyst id, the area, the subject (empty
 * string when absent) and the normalized claim. When `input.id_basis` is
 * present it is normalized and hashed in place of the claim.
 *
 * @param {object} input - Carries `analyst_id`, `area`, `claim` and optionally
 *   `subject` / `id_basis`.
 * @returns {string} Identifier of the form `f_<20 hex chars>`.
 */
function computeFindingId(input) {
  const claimBasis = input.id_basis ?? input.claim;
  // Key order (a, r, s, c) is part of the hash input and must not change.
  const payload = {
    a: input.analyst_id,
    r: input.area,
    s: input.subject ?? "",
    c: normalizeClaim(claimBasis)
  };
  const digest = createHash("sha256").update(JSON.stringify(payload)).digest("hex");
  return `f_${digest.slice(0, 20)}`;
}
20
/**
 * Canonicalize a claim so that cosmetic differences do not change its hash.
 *
 * Lower-cases the text, collapses every whitespace run to a single space,
 * trims both ends, removes a trailing run of `. ! ? ; : ,` and finally drops
 * any whitespace exposed by that removal.
 *
 * Surrounding whitespace is trimmed BEFORE the punctuation is stripped, so
 * "Claim." and "Claim. " normalize to the same value. Stripping first would
 * leave the period in place whenever whitespace followed it.
 *
 * @param {string} c - Raw claim text.
 * @returns {string} Normalized claim.
 */
function normalizeClaim(c) {
  return c
    .toLowerCase()
    .replace(/\s+/g, " ")
    .trim()
    .replace(/[.!?;:,]+$/, "")
    .trimEnd();
}
23
/**
 * Assemble a finding record from caller-supplied fields.
 *
 * `id_basis` and `produced_at` are consumed here and not copied verbatim:
 * `id_basis` only feeds the id computation, and `produced_at` defaults to the
 * current time in ISO-8601 form. All remaining fields of `init` are spread
 * last, so a field of the same name in `init` overrides the generated
 * `schema_version` / `finding_id`.
 *
 * @param {object} init - Finding fields (`analyst_id`, `area`, `claim`, ...).
 * @returns {object} The finding with `schema_version`, `finding_id` and
 *   `produced_at` populated.
 */
function makeFinding(init) {
  const { id_basis, produced_at, ...fields } = init;
  const finding_id = computeFindingId({
    analyst_id: fields.analyst_id,
    area: fields.area,
    subject: fields.subject,
    claim: fields.claim,
    id_basis
  });
  const timestamp = produced_at ?? (/* @__PURE__ */ new Date()).toISOString();
  return {
    schema_version: "1.0.0",
    finding_id,
    produced_at: timestamp,
    ...fields
  };
}
38
+
39
+ // src/analyst/finding-subject.ts
40
+ import { z } from "zod";
41
/**
 * Every `kind` value that `parseFindingSubject` can produce, grouped by the
 * sections used in the subject grammar prompt.
 */
var FINDING_SUBJECT_KINDS = [
  // Knowledge loci
  "knowledge.wiki", "knowledge.claim", "knowledge.raw", "knowledge.stale",
  // Runtime mutable surfaces
  "system-prompt", "tool-doc", "new-tool", "rag", "memory", "scaffolding", "output-schema",
  // Stale signals
  "websearch.outdated", "prior-run-summary",
  // Bare cluster label
  "cluster"
];
57
/**
 * Build a rule handler for the `<prefix>:<free-form text>` subject forms.
 * The captured text is trimmed; an empty capture yields no subject.
 */
function freeFormSubject(kind, field) {
  return (m) => {
    const value = m[1].trim();
    return value.length > 0 ? { kind, [field]: value } : null;
  };
}

/**
 * Ordered `[pattern, build]` pairs tried by `parseFindingSubject`. `build`
 * receives the regex match and returns the parsed subject, or null to let the
 * next rule run.
 */
const FINDING_SUBJECT_RULES = [
  [
    /^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/,
    (m) => ({ kind: "knowledge.wiki", slug: m[1], ...(m[2] ? { heading: m[2] } : {}) })
  ],
  [/^agent-knowledge:claim:(.+)$/, freeFormSubject("knowledge.claim", "topic")],
  [/^agent-knowledge:raw:(.+)$/, freeFormSubject("knowledge.raw", "sourceId")],
  [/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/, (m) => ({ kind: "knowledge.stale", slug: m[1] })],
  [/^system-prompt:(.+)$/, freeFormSubject("system-prompt", "section")],
  [
    /^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/,
    (m) => {
      const aspect = m[2].trim();
      return aspect.length > 0 ? { kind: "tool-doc", tool: m[1], aspect } : null;
    }
  ],
  [/^tool-doc:([a-z0-9][a-z0-9_-]*)$/, (m) => ({ kind: "tool-doc", tool: m[1] })],
  [/^new-tool:([a-z0-9][a-z0-9_-]*)$/, (m) => ({ kind: "new-tool", name: m[1] })],
  [
    /^rag:([a-z0-9][a-z0-9_-]*):(.+)$/,
    (m) => {
      const docId = m[2].trim();
      return docId.length > 0 ? { kind: "rag", corpus: m[1], docId } : null;
    }
  ],
  [/^memory:(.+)$/, freeFormSubject("memory", "key")],
  [/^scaffolding:(.+)$/, freeFormSubject("scaffolding", "concern")],
  [/^output-schema:(.+)$/, freeFormSubject("output-schema", "field")],
  [/^websearch:outdated:(.+)$/, freeFormSubject("websearch.outdated", "topic")],
  [/^prior-run-summary:(.+)$/, freeFormSubject("prior-run-summary", "topic")]
];

/**
 * Parse a subject string into its structured form.
 *
 * The input is trimmed and matched against the prefixed subject forms in a
 * fixed order. If none applies, a string of at most 80 characters made of
 * `[a-z0-9._-]` (starting with `[a-z0-9]`) is accepted as a bare cluster label.
 *
 * @param {unknown} raw - Candidate subject. Anything that is not a string
 *   (including null / undefined) yields null instead of throwing.
 * @returns {object|null} `{ kind, ...fields }`, or null when the input is
 *   blank or does not match the grammar.
 */
function parseFindingSubject(raw) {
  if (typeof raw !== "string") return null;
  const trimmed = raw.trim();
  if (trimmed.length === 0) return null;
  for (const [pattern, build] of FINDING_SUBJECT_RULES) {
    const match = trimmed.match(pattern);
    if (!match) continue;
    const subject = build(match);
    if (subject !== null) return subject;
  }
  if (trimmed.length <= 80 && /^[a-z0-9][a-z0-9._-]*$/.test(trimmed)) {
    return { kind: "cluster", label: trimmed };
  }
  return null;
}
103
/** Per-kind serializers used by `renderFindingSubject`. */
const FINDING_SUBJECT_RENDERERS = new Map([
  [
    "knowledge.wiki",
    (s) => (s.heading ? `agent-knowledge:wiki:${s.slug}#${s.heading}` : `agent-knowledge:wiki:${s.slug}`)
  ],
  ["knowledge.claim", (s) => `agent-knowledge:claim:${s.topic}`],
  ["knowledge.raw", (s) => `agent-knowledge:raw:${s.sourceId}`],
  ["knowledge.stale", (s) => `agent-knowledge:stale:${s.slug}`],
  ["system-prompt", (s) => `system-prompt:${s.section}`],
  ["tool-doc", (s) => (s.aspect ? `tool-doc:${s.tool}:${s.aspect}` : `tool-doc:${s.tool}`)],
  ["new-tool", (s) => `new-tool:${s.name}`],
  ["rag", (s) => `rag:${s.corpus}:${s.docId}`],
  ["memory", (s) => `memory:${s.key}`],
  ["scaffolding", (s) => `scaffolding:${s.concern}`],
  ["output-schema", (s) => `output-schema:${s.field}`],
  ["websearch.outdated", (s) => `websearch:outdated:${s.topic}`],
  ["prior-run-summary", (s) => `prior-run-summary:${s.topic}`],
  ["cluster", (s) => s.label]
]);

/**
 * Serialize a structured subject back into its string form.
 *
 * @param {object} s - Subject carrying a `kind` plus the fields of that kind.
 * @returns {string|undefined} The subject string, or undefined when `s.kind`
 *   is not one of the known kinds.
 */
function renderFindingSubject(s) {
  const render = FINDING_SUBJECT_RENDERERS.get(s.kind);
  if (render === undefined) return undefined;
  return render(s);
}
135
/**
 * Prompt fragment describing the accepted subject forms, one line per array
 * entry, joined with newlines.
 *
 * NOTE(review): the closing line advertises `[a-z0-9-]+` for slugs / tool ids,
 * while `parseFindingSubject` also accepts `_` in tool, new-tool and rag corpus
 * names and `.` / `_` in cluster labels — confirm which side is authoritative.
 * NOTE(review): the "Stale signals" heading says "knowledge-poisoning only",
 * but `KIND_EXPECTED_SUBJECTS` also allows both stale-signal kinds for
 * "knowledge-gap" — confirm.
 */
var FINDING_SUBJECT_GRAMMAR_PROMPT = [
  "Subjects MUST match this grammar \u2014 anything else is rejected at parse time and your work is wasted:",
  "",
  "  Knowledge loci (write to the agent-knowledge base):",
  "    agent-knowledge:wiki:<slug>[#<heading>]    create / update a wiki page",
  "    agent-knowledge:claim:<topic>              draft a claim / relation triple",
  "    agent-knowledge:raw:<source-id>            lift a raw source into a curated page",
  "    agent-knowledge:stale:<slug>               mark a page superseded",
  "",
  "  Runtime mutable surfaces (write to prompts / tools / scaffolding):",
  "    system-prompt:<section>                    add / replace a system-prompt section",
  "    tool-doc:<tool>[:<aspect>]                 rewrite a tool description",
  "    new-tool:<name>                            propose a new tool surface",
  "    rag:<corpus>:<doc-id>                      ingest / correct a RAG document",
  "    memory:<key>                               invalidate / set a memory entry",
  "    scaffolding:<concern>                      change a precondition / retry / verifier",
  "    output-schema:<field>                      constrain the agent output shape",
  "",
  "  Stale signals (knowledge-poisoning only):",
  "    websearch:outdated:<topic>                 stale web result",
  "    prior-run-summary:<topic>                  stale prior-run summary",
  "",
  "  Cluster label (failure-mode only):",
  '    <kebab-case-label>                         short cluster id, e.g. "tool-call-loop"',
  "",
  "Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed."
].join("\n");
162
/**
 * Subject kinds each analyst kind is allowed to emit, keyed by analyst kind id.
 * `createTraceAnalystKind` rejects a finding whose parsed subject kind is not
 * in the list for its analyst; analyst ids without an entry are not filtered.
 */
var KIND_EXPECTED_SUBJECTS = {
  "failure-mode": ["cluster"],
  "knowledge-gap": [
    "knowledge.wiki", "knowledge.claim", "knowledge.raw", "knowledge.stale",
    "tool-doc", "system-prompt", "memory",
    "websearch.outdated", "prior-run-summary"
  ],
  "knowledge-poisoning": [
    "knowledge.wiki", "knowledge.claim", "knowledge.raw",
    "tool-doc", "system-prompt", "memory",
    "websearch.outdated", "prior-run-summary"
  ],
  improvement: [
    "system-prompt", "tool-doc", "new-tool", "rag", "memory", "scaffolding", "output-schema",
    "knowledge.wiki", "knowledge.claim"
  ]
};
197
/**
 * Zod schema for a subject string: any string that `parseFindingSubject`
 * accepts (i.e. does not map to null).
 */
var FindingSubjectStringSchema = z.string().refine(
  (candidate) => parseFindingSubject(candidate) !== null,
  { message: "subject does not match the finding-subject grammar" }
);
200
+
201
+ // src/analyst/parse-tolerant.ts
202
/**
 * Remove a Markdown code fence that wraps the ENTIRE (trimmed) text.
 *
 * Only a fence that opens at the very start and closes at the very end is
 * unwrapped; an optional alphanumeric language tag after the opening fence is
 * discarded. Text with prose around the fence is returned trimmed but
 * otherwise untouched.
 *
 * @param {string} text
 * @returns {string} The fence's inner content (trimmed), or the trimmed input.
 */
function stripCodeFences(text) {
  const trimmed = text.trim();
  const fenced = /^```[a-zA-Z0-9]*\s*\n?([\s\S]*?)\n?```$/.exec(trimmed);
  if (fenced === null) return trimmed;
  return fenced[1].trim();
}
208
/**
 * Remove commas that directly precede a closing `}` or `]` (optionally
 * separated by whitespace), which JSON.parse would otherwise reject.
 *
 * The scan tracks double-quoted string literals (including backslash escapes)
 * so that a `,}` or `,]` sequence INSIDE a string value is left alone; a blind
 * regex replace would silently corrupt such values.
 *
 * @param {string} s - JSON-like text.
 * @returns {string} The text with trailing commas removed; whitespace between
 *   the comma and the bracket is preserved.
 */
function dropTrailingCommas(s) {
  const whitespace = /\s/;
  let out = "";
  let inString = false;
  let escaped = false;
  for (let i = 0; i < s.length; i++) {
    const ch = s[i];
    if (inString) {
      out += ch;
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
      out += ch;
      continue;
    }
    if (ch === ",") {
      // Look past whitespace for the next significant character.
      let j = i + 1;
      while (j < s.length && whitespace.test(s[j])) j++;
      if (s[j] === "}" || s[j] === "]") continue; // trailing comma: drop it
    }
    out += ch;
  }
  return out;
}
211
/**
 * Best-effort JSON parse of model output.
 *
 * The text is first unwrapped from a surrounding code fence and stripped of
 * trailing commas, then handed to JSON.parse.
 *
 * @param {string} text
 * @returns {unknown} The parsed value, or undefined when parsing fails.
 */
function coerceJson(text) {
  const cleaned = dropTrailingCommas(stripCodeFences(text));
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    // Unparseable output is an expected outcome here; signal it with undefined.
    return undefined;
  }
  return parsed;
}
219
/**
 * Normalize whatever the model returned into an array of candidate rows.
 *
 * - a string is parsed with `coerceJson` first (unparseable → no rows);
 * - an array is returned as-is;
 * - an object with an array `findings` property yields that array;
 * - any other object is treated as a single row;
 * - everything else yields no rows.
 *
 * @param {unknown} raw
 * @returns {unknown[]} Candidate rows (not yet schema-validated).
 */
function coerceToFindingRows(raw) {
  const value = typeof raw === "string" ? coerceJson(raw) : raw;
  if (Array.isArray(value)) return value;
  if (value === undefined || value === null || typeof value !== "object") return [];
  return Array.isArray(value.findings) ? value.findings : [value];
}
234
+
235
+ // src/analyst/finding-signature.ts
236
+ import { z as z2 } from "zod";
237
/** Severity levels a finding may carry, most to least severe. */
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];

/**
 * Strict Zod schema for one raw finding row as emitted by an analyst agent.
 * Unknown keys are rejected (`.strict()`).
 */
var RawAnalystFindingSchema = z2
  .object({
    severity: z2.enum(ANALYST_SEVERITIES),
    claim: z2.string().min(1).max(2000),
    /**
     * Locus the finding is about, checked at parse time against the subject
     * grammar via `parseFindingSubject`. A malformed subject fails the whole
     * row, so the problem surfaces here rather than being skipped silently
     * further downstream.
     *
     * Optional: a purely descriptive finding with no actionable locus is
     * legitimate and simply carries no subject.
     */
    subject: z2
      .string()
      .max(400)
      .refine((s) => parseFindingSubject(s) !== null, {
        message: "subject does not match the finding-subject grammar"
      })
      .optional(),
    evidence_uri: z2.string().min(1).max(2000),
    evidence_excerpt: z2.string().max(2000).optional(),
    confidence: z2.number().min(0).max(1),
    rationale: z2.string().max(4000).optional(),
    recommended_action: z2.string().max(2000).optional()
  })
  .strict();
261
/**
 * Prompt fragment that describes the raw finding row to the model. It mirrors
 * the fields of `RawAnalystFindingSchema` and is appended to each kind's actor
 * description by `createTraceAnalystKind`.
 *
 * NOTE(review): the text restricts `evidence_uri` to span:// / artifact:// /
 * metric://, but the schema only enforces a non-empty string of at most 2000
 * characters — the scheme itself is not validated.
 */
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
- severity: one of "critical" | "high" | "medium" | "low" | "info"
- claim: one-sentence statement (max 2000 chars)
- subject?: the routing locus this finding is about. It MUST be one of the exact subject forms listed in this kind's instructions above (e.g. \`system-prompt:<section>\`, \`agent-knowledge:wiki:<slug>\`, \`tool-doc:<tool>\`). A free phrase, a bare noun, or any form not in that list is REJECTED at parse time and the finding is discarded \u2014 omit subject entirely rather than guess a form.
- evidence_uri: REQUIRED, never blank. Exactly one of "span://<trace_id>/<span_id>" (trace evidence), "artifact://<relative-path>" (files), "metric://<name>" (named scalars) \u2014 ALWAYS cite a real id surfaced by the tools. If you have no citable id, do not emit the finding.
- evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
- confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
- rationale?: one or two sentences explaining the reasoning
- recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive

Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
272
/**
 * Validate one candidate row against `RawAnalystFindingSchema`.
 *
 * A row that arrives as a string gets a second chance: it is run through
 * `coerceJson` and the decoded value is validated again.
 *
 * @param {unknown} row - Candidate finding (object, or a JSON string of one).
 * @param {Function} [log] - Optional logger called as `log(message, details)`
 *   when the row is rejected.
 * @returns {object|null} The validated finding, or null when it is rejected.
 */
function parseRawFinding(row, log) {
  const direct = RawAnalystFindingSchema.safeParse(row);
  if (direct.success) return direct.data;
  // Report the most informative failure. For a string row that decoded to
  // JSON, the retry's issues say which field is wrong, whereas the first
  // attempt only says "expected object, received string".
  let failure = direct;
  if (typeof row === "string") {
    const coerced = coerceJson(row);
    if (coerced !== undefined) {
      const retry = RawAnalystFindingSchema.safeParse(coerced);
      if (retry.success) return retry.data;
      failure = retry;
    }
  }
  log?.("finding rejected: schema failure", {
    issues: failure.error.issues.map((i) => ({
      path: i.path.join("."),
      code: i.code,
      message: i.message
    }))
  });
  return null;
}
291
+
292
+ // src/analyst/structure-findings.ts
293
/**
 * System prompt for the recovery pass in `structureFindings`, which asks a
 * model to turn a free-form analysis report into a JSON array of findings.
 * The sentences are joined with single spaces into one paragraph.
 */
var SYSTEM = [
  "You convert a free-form trace-analysis report into a STRICT JSON array of findings.",
  "Output ONLY the JSON array \u2014 no prose, no code fences.",
  'Each element: {"severity":"critical|high|medium|low|info","claim":string,"evidence_uri":string,',
  '"subject"?:string,"rationale"?:string,"recommended_action"?:string,"confidence":number(0..1)}.',
  'evidence_uri cites the trace element the report referenced (e.g. "span://<trace>/<span>") or "report://summary".',
  "If the report asserts NO problems, output exactly []."
].join(" ");
301
/**
 * Turn raw model output into validated findings for the recovery pass.
 *
 * Rows that are plain objects without an `evidence_uri` are given the
 * placeholder "report://summary" before validation. Rows that fail the schema
 * are skipped.
 *
 * The evidence kind is derived with `evidenceKindFromUri`, the same mapping
 * `toAnalystFinding` uses, so a given URI scheme (e.g. `metric://`) is
 * classified identically on the normal and the recovery path.
 *
 * @param {unknown} raw - Model output (string, array or object).
 * @param {string} analystId - Stored as the finding's `analyst_id`.
 * @param {string} area - Stored as the finding's `area`.
 * @returns {object[]} Findings built with `makeFinding`.
 */
function buildRows(raw, analystId, area) {
  const findings = [];
  for (const row of coerceToFindingRows(raw)) {
    const isPlainRow = row && typeof row === "object" && !Array.isArray(row);
    const candidate = isPlainRow && !row.evidence_uri ? { ...row, evidence_uri: "report://summary" } : row;
    const parsed = parseRawFinding(candidate);
    if (!parsed) continue;
    findings.push(
      makeFinding({
        analyst_id: analystId,
        area,
        subject: parsed.subject,
        claim: parsed.claim,
        rationale: parsed.rationale,
        severity: parsed.severity,
        confidence: parsed.confidence,
        evidence_refs: [
          {
            kind: evidenceKindFromUri(parsed.evidence_uri),
            uri: parsed.evidence_uri,
            excerpt: parsed.evidence_excerpt
          }
        ],
        recommended_action: parsed.recommended_action
      })
    );
  }
  return findings;
}
330
/**
 * Recovery pass: ask a model to convert a prose report into findings.
 *
 * The model is called up to `maxReasks + 1` times (`maxReasks` defaults to 1).
 * After an attempt that yields no valid findings:
 *  - a report shorter than 200 characters (trimmed) is accepted as genuinely
 *    empty and the result is `{ findings: [], outcome: "ok" }`;
 *  - otherwise a nudge is appended to the user message and the call repeats.
 * The nudge is appended to the previous message, so it accumulates across
 * attempts.
 *
 * @param {object} opts - `report`, `analystId`, `area`, `model`, optional
 *   `maxReasks`, and the LLM transport settings `baseUrl` / `apiKey` /
 *   `fetchImpl`.
 * @returns {Promise<{findings: object[], outcome: string}>} `outcome` is "ok"
 *   or "extraction_failed".
 */
async function structureFindings(opts) {
  const reaskBudget = opts.maxReasks ?? 1;
  const transport = { baseUrl: opts.baseUrl, apiKey: opts.apiKey, fetch: opts.fetchImpl };
  let userMessage = `TRACE-ANALYSIS REPORT:\n${opts.report}\n\nReturn the findings JSON array.`;
  let attempt = 0;
  while (attempt <= reaskBudget) {
    const request = {
      model: opts.model,
      messages: [
        { role: "system", content: SYSTEM },
        { role: "user", content: userMessage }
      ]
    };
    const response = await callLlm(request, transport);
    const findings = buildRows(response.content.trim(), opts.analystId, opts.area);
    if (findings.length > 0) {
      return { findings, outcome: "ok" };
    }
    if (opts.report.trim().length < 200) {
      return { findings: [], outcome: "ok" };
    }
    userMessage = `${userMessage}\n\nThat produced no valid findings. The report DOES describe issues \u2014 re-extract them as the strict JSON array described in the system prompt. Output ONLY the array.`;
    attempt += 1;
  }
  return { findings: [], outcome: "extraction_failed" };
}
358
+
359
+ // src/analyst/kind-factory.ts
360
+ import { AxJSRuntime, agent } from "@ax-llm/ax";
361
/**
 * Build an analyst "kind" (a descriptor plus an `analyze` function) from a
 * declarative spec.
 *
 * `analyze(store, ctx)`:
 *  1. builds the tools with `spec.buildTools(store)` and configures an
 *     `@ax-llm/ax` agent whose actor prompt is the spec's description, the
 *     rendered prior findings and the raw-finding schema prompt;
 *  2. forwards the derived question and validates every returned row with
 *     `parseRawFinding`;
 *  3. rejects rows whose subject kind is not allowed for this analyst
 *     (per `KIND_EXPECTED_SUBJECTS`, when an entry exists for `spec.id`);
 *  4. runs the optional `spec.postProcess` hook on each surviving row;
 *  5. if nothing survived but the prose report is at least 200 characters
 *     long, optionally runs `structureFindings` (when `opts.recovery` is set)
 *     and, failing that, emits a single low-confidence "info" finding that
 *     carries the report so the diagnosis is not lost.
 *
 * @param {object} spec - Kind specification (`id`, `description`, `area`,
 *   `version`, `actorDescription`, `buildTools`, optional `recursion`,
 *   `maxTurns`, `maxRuntimeChars`, `responderDescription`, `postProcess`,
 *   `cost`).
 * @param {object} opts - Runtime options (`ai`, optional `model`,
 *   `versionSuffix`, `recovery`).
 * @returns {object} Analyst kind with `id`, `description`, `inputKind`,
 *   `cost`, `version` and `analyze`.
 */
function createTraceAnalystKind(spec, opts) {
  const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
  return {
    id: spec.id,
    description: spec.description,
    inputKind: "trace-store",
    cost: spec.cost,
    version,
    async analyze(store, ctx) {
      const tools = spec.buildTools(store);
      const maxDepth = spec.recursion?.maxDepth ?? 0;
      const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
      const priorContext = renderPriorFindings(ctx.priorFindings);
      const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nFirst write `report`: a concise free-form prose diagnosis of what the traces show \u2014 what succeeded, what was suboptimal or failed \u2014 with concrete trace ids and numbers. THEN return the structured `findings` array (it MAY be empty when there is nothing to report). Use `final(...)` with the `{ report, findings }` payload when you are done.";
      const ax = agent(
        "question:string -> report:string, findings:json[]",
        {
          agentIdentity: {
            name: spec.id,
            description: spec.description
          },
          contextFields: ["question"],
          runtime: new AxJSRuntime({
            permissions: [],
            blockDynamicImport: true,
            allowedModules: [],
            freezeIntrinsics: true,
            blockShadowRealm: true,
            preventGlobalThisExtensions: false
          }),
          // Recursion (sub-agent delegation) is only enabled when the spec asks for it.
          mode: maxDepth > 0 ? "advanced" : "simple",
          recursionOptions: maxDepth > 0 ? { maxDepth } : void 0,
          maxTurns: spec.maxTurns ?? 12,
          maxRuntimeChars: spec.maxRuntimeChars ?? 6e3,
          maxBatchedLlmQueryConcurrency: maxParallel,
          promptLevel: "detailed",
          // Trace analysis depends on exact prior tool results and runtime variables.
          contextPolicy: { preset: "full", budget: "balanced" },
          functions: { local: tools },
          actorOptions: {
            description: actorDescription,
            ...opts.model ? { model: opts.model } : {},
            showThoughts: false,
            thinkingTokenBudget: "none"
          },
          responderOptions: {
            description: spec.responderDescription ?? "Pass through the actor's `report` prose verbatim, and format the `findings` array exactly as the actor produced it. Do not add, drop, or summarize entries.",
            ...opts.model ? { model: opts.model } : {},
            showThoughts: false
          },
          bubbleErrors: [TraceFileMissingError]
        }
      );
      ctx.log?.(`analyst.kind ${spec.id} forward`, {
        max_depth: maxDepth,
        tool_count: tools.length,
        tags: ctx.tags
      });
      const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
      const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id];
      const out = [];
      const rawRows = Array.isArray(result.findings) ? result.findings : [];
      let rejectedWrongKind = 0;
      for (const row of rawRows) {
        const parsed = parseRawFinding(row, ctx.log);
        if (!parsed) continue;
        if (expectedSubjects && parsed.subject !== void 0) {
          const parsedSubject = parseFindingSubject(parsed.subject);
          if (parsedSubject === null) {
            ctx.log?.("finding rejected: subject failed to parse", {
              kind: spec.id,
              subject: parsed.subject
            });
            rejectedWrongKind += 1;
            continue;
          }
          if (!expectedSubjects.includes(parsedSubject.kind)) {
            ctx.log?.("finding rejected: subject variant not allowed for this kind", {
              kind: spec.id,
              subject_kind: parsedSubject.kind,
              subject: parsed.subject,
              allowed: expectedSubjects
            });
            rejectedWrongKind += 1;
            continue;
          }
        }
        // `undefined` (no hook, or a hook that returns nothing) keeps the
        // parsed row. Any other falsy result — notably an explicit `null` —
        // drops the finding. Using `?? parsed` here would turn a `null`
        // result back into `parsed`, so a hook could never drop a row.
        const hookResult = spec.postProcess?.(parsed, ctx);
        const postProcessed = hookResult === void 0 ? parsed : hookResult;
        if (!postProcessed) continue;
        out.push(toAnalystFinding(spec, postProcessed));
      }
      ctx.log?.(`analyst.kind ${spec.id} done`, {
        emitted: rawRows.length,
        accepted: out.length,
        rejected_wrong_subject: rejectedWrongKind
      });
      const report = typeof result.report === "string" ? result.report : "";
      // A substantial report with zero accepted findings means the structured
      // output was lost somewhere; try to recover it rather than return nothing.
      if (out.length === 0 && report.trim().length >= 200) {
        if (opts.recovery) {
          const recovered = await structureFindings({
            report,
            analystId: spec.id,
            area: spec.area,
            model: opts.recovery.model ?? opts.model ?? "",
            baseUrl: opts.recovery.baseUrl,
            apiKey: opts.recovery.apiKey,
            fetchImpl: opts.recovery.fetchImpl
          });
          out.push(...recovered.findings);
          ctx.log?.(`analyst.kind ${spec.id} recovery`, {
            outcome: recovered.outcome,
            recovered: recovered.findings.length
          });
        }
        if (out.length === 0) {
          // Last resort: surface the prose diagnosis as one "info" finding.
          out.push(
            makeFinding({
              analyst_id: spec.id,
              area: spec.area,
              claim: "Analyst produced a diagnosis but no structured findings \u2014 see report.",
              rationale: report.slice(0, 1500),
              severity: "info",
              confidence: 0.3,
              evidence_refs: [
                { kind: "artifact", uri: "report://summary", excerpt: report.slice(0, 2e3) }
              ],
              metadata: { outcome: "extraction_failed" }
            })
          );
        }
      }
      return out;
    }
  };
}
496
/**
 * Compose the question sent to the analyst agent.
 *
 * @param {object} ctx - Analysis context; `ctx.tags.focus`, when present and
 *   non-blank after trimming, is appended as a "Focus:" sentence.
 * @param {object} spec - Kind spec supplying `area` and `description`.
 * @returns {string} The question text.
 */
function deriveQuestion(ctx, spec) {
  const focus = ctx.tags?.focus?.trim();
  const base = `Analyze this trace dataset with the available tools and report ${spec.area} findings. ${spec.description}`;
  if (!focus) return base;
  return `${base} Focus: ${focus}.`;
}
501
/**
 * Convert a validated raw row into a finished finding for this kind.
 *
 * The single evidence reference takes its kind from the URI scheme, and the
 * spec's (unsuffixed) version is recorded under `metadata.kind_version`.
 *
 * @param {object} spec - Kind spec supplying `id`, `area` and `version`.
 * @param {object} raw - Row that passed `RawAnalystFindingSchema`.
 * @returns {object} Finding built with `makeFinding`.
 */
function toAnalystFinding(spec, raw) {
  const evidence = {
    kind: evidenceKindFromUri(raw.evidence_uri),
    uri: raw.evidence_uri,
    excerpt: raw.evidence_excerpt
  };
  return makeFinding({
    analyst_id: spec.id,
    area: spec.area,
    subject: raw.subject,
    claim: raw.claim,
    rationale: raw.rationale,
    severity: raw.severity,
    confidence: raw.confidence,
    evidence_refs: [evidence],
    recommended_action: raw.recommended_action,
    metadata: { kind_version: spec.version }
  });
}
521
/**
 * Classify an evidence URI by its scheme.
 *
 * @param {string} uri
 * @returns {string} "span", "artifact", "metric", "event" or "finding";
 *   any other scheme falls back to "artifact".
 */
function evidenceKindFromUri(uri) {
  const knownSchemes = ["span", "artifact", "metric", "event", "finding"];
  const matched = knownSchemes.find((scheme) => uri.startsWith(`${scheme}://`));
  return matched ?? "artifact";
}
529
/**
 * Render findings from a previous run as a prompt section.
 *
 * At most 40 findings are listed, one per line; when more exist, a trailer
 * states how many were left out. The section starts with two newlines so it
 * is separated from whatever text it is appended to. (Filtering the empty
 * separator entries out of the line list would glue the "PRIOR FINDINGS"
 * header directly onto the preceding prompt text.)
 *
 * @param {object[]|null|undefined} prior - Findings with `finding_id`,
 *   `severity`, `claim` and optionally `subject`.
 * @returns {string} The section, or "" when there are no prior findings.
 */
function renderPriorFindings(prior) {
  if (!prior || prior.length === 0) return "";
  const MAX_ROWS = 40;
  const rows = prior.slice(0, MAX_ROWS).map((f) => {
    const subject = f.subject ? ` [${f.subject}]` : "";
    return `  - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
  });
  const lines = [
    "",
    "",
    "PRIOR FINDINGS (from a previous run on related data):",
    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
    ...rows
  ];
  if (prior.length > MAX_ROWS) {
    // Leading newline leaves a blank line between the rows and the trailer.
    lines.push(`\n... +${prior.length - MAX_ROWS} more prior findings (older history truncated)`);
  }
  return lines.join("\n");
}

/**
 * Shorten a string to at most `max` characters, marking the cut with an
 * ellipsis character.
 *
 * @param {string} s
 * @param {number} max - Maximum length including the ellipsis.
 * @returns {string} `s` unchanged when it fits; otherwise its first `max - 1`
 *   characters (trailing whitespace removed) followed by "\u2026".
 */
function truncateForContext(s, max) {
  if (s.length <= max) return s;
  return `${s.slice(0, max - 1).trimEnd()}\u2026`;
}
552
+
553
+ // src/analyst/tool-groups.ts
554
/**
 * Tool-name allow-lists per tool group. The `all` entry is intentionally
 * empty: `buildTraceToolsForGroup` returns every tool for "all" without
 * consulting this table.
 */
var TOOL_NAMES_BY_GROUP = {
  all: /* @__PURE__ */ new Set(),
  discovery: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces"]),
  discoveryAndRead: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces", "viewTrace", "viewSpans"]),
  discoveryAndSearch: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces", "searchTrace", "searchSpan"]),
  targeted: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "viewSpans", "searchSpan"])
};
573
/**
 * Build the trace-analysis tools for a store, restricted to one tool group.
 *
 * @param {string} group - "all" or a key of `TOOL_NAMES_BY_GROUP`.
 * @param {object} store - Trace store handed to `buildTraceAnalystTools`.
 * @returns {object[]} Every tool for "all", otherwise only the tools whose
 *   `name` is in the group's allow-list.
 * @throws {Error} When `group` is not a known group name.
 */
function buildTraceToolsForGroup(group, store) {
  const all = buildTraceAnalystTools({ store });
  if (group === "all") return all;
  // Own-property lookup only: an inherited key such as "constructor" would
  // otherwise pass the guard below and then fail on `allow.has`.
  const allow = Object.prototype.hasOwnProperty.call(TOOL_NAMES_BY_GROUP, group) ? TOOL_NAMES_BY_GROUP[group] : void 0;
  if (!allow) throw new Error(`unknown trace tool group: ${group}`);
  return all.filter((tool) => allow.has(tool.name));
}
580
+
581
+ // src/analyst/kinds/failure-mode.ts
582
/**
 * Actor prompt for the "failure-mode" analyst kind: instructs the agent to
 * cluster failures in the trace dataset into distinct modes and cite evidence.
 *
 * NOTE(review): the last rule says to call `final({ findings: [...] })`,
 * whereas the text `createTraceAnalystKind` appends asks for a
 * `{ report, findings }` payload — confirm which payload shape is intended.
 */
var ACTOR_PROMPT = `You are a failure-mode classifier for an OTLP trace dataset. Your job is to identify the **distinct ways agents failed** in this dataset, not to grade individual runs.

DISCOVERY \u2192 CLUSTER \u2192 CITE protocol:

1. Call \`traces.getDatasetOverview({})\` first. Use \`has_errors\`, \`models\`, \`agent_names\`, \`tools\`, and \`sample_trace_ids\` to size the failure surface.
2. Use \`traces.queryTraces({ filters: { has_errors: true }, limit })\` to pull error-bearing traces. Combine with \`traces.countTraces\` to see what fraction of the dataset failed.
3. For each candidate failure cluster, use \`traces.searchTrace\` with regex like \`STATUS_CODE_ERROR\`, \`MaxTurnsExceeded\`, \`assertion\`, \`unauthorized\`, \`timeout\`, \`429\`, \`5\\d\\d\`, the agent's specific error strings, or the names of its tools. Pull one or two representative traces per cluster, **not all** of them.
4. **Cluster, do not enumerate.** Two failures with the same root cause should be ONE finding citing both traces, not two findings. The point of this analyst is to compress N runs into K modes.
5. For each cluster you can defend with evidence, emit ONE finding with:
- \`area\` = "failure-mode"
- \`subject\` = a short label for the cluster ("tool-call-loop", "auth-revoked-mid-run", "agent-asked-clarification-too-late", ...)
- \`claim\` = one sentence stating the mode
- \`severity\` = "critical" when it blocks the run, "high" when the run finished degraded, "medium" when it slowed convergence
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the most representative span
- \`evidence_excerpt\` = the exact quote (e.g. error message, stuck tool call payload, contradictory turn output)
- \`confidence\` = 0.85+ when multiple traces show the same shape; 0.6-0.8 for a single-trace inference; <0.5 for speculative.
- \`recommended_action\` = imperative-phrased fix idea (kept short \u2014 the improvement-analyst will expand on these)

If the dataset has no failures, return an empty findings array \u2014 do NOT pad with low-confidence speculation.

**Delegate aggressively.** The recursion budget is there to be used:
- After your first \`getDatasetOverview\` + \`queryTraces\` calls, you should have 3-6 candidate failure clusters in mind. Spawn one \`llmQuery\` per cluster in a single batch \u2014 they investigate in parallel.
- A sub-investigator that finds its cluster is actually two distinct modes should split again at its own level. Recursion is meant to discover sub-modes, not to do trivial drilling that the parent could do in-line.
- Pass narrow context to each subagent: { question: 'investigate the auth-revoked-mid-run cluster', context: { trace_ids: ['abc', 'def'], suspected_root_cause: 'token refresh skipped on idle sessions' } }. Subagents need enough context to skip re-discovery but not the whole conversation.
- Each subagent returns its findings as JSON; the parent merges them. Do NOT have subagents call \`final()\` \u2014 they return their findings list to you, and you call \`final()\` once at the top.

OBSERVABILITY rules:
- Each non-final turn must emit at least one \`console.log\` for evidence.
- Reuse runtime variables across turns; don't recompute.
- Call \`final({ findings: [...] })\` exactly once, after you've gathered evidence for every cluster you intend to report.`;
612
/**
 * Spec of the "failure-mode" analyst kind: full tool access, recursion up to
 * depth 3 with at most 4 parallel sub-agents, and a 24-turn budget.
 */
var FAILURE_MODE_KIND_SPEC = {
  id: "failure-mode",
  description: "Clusters trace-dataset failures into distinct failure modes with cited evidence and a short recommended action.",
  area: "failure-mode",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT,
  buildTools(store) {
    return buildTraceToolsForGroup("all", store);
  },
  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
  maxTurns: 24,
  cost: { kind: "llm" }
};
623
+
624
+ // src/analyst/kinds/improvement.ts
625
/**
 * Actor prompt for the "improvement" analyst kind; used verbatim as
 * `actorDescription` of `IMPROVEMENT_KIND_SPEC`.
 *
 * This is runtime text sent to the model: whitespace, escapes (`\u2014`,
 * `\u2192`) and the escaped backticks are all part of the prompt.
 */
var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.

Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.

DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:

1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
- **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
- **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
- **New tool** \u2014 add a tool the agent kept emulating in code
- **RAG ingestion** \u2014 add a document or correct a stale one
- **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
- **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
- **Output schema** \u2014 narrow the agent's output to forbid the failure shape
3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."

For each winning recommendation, emit ONE finding with:
- \`area\` = "improvement"
- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
- \`evidence_excerpt\` = a fragment showing the problem the fix targets
- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"

If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.

Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.

OBSERVABILITY rules:
- Each non-final turn must emit at least one \`console.log\` for evidence.
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
662
/**
 * Spec for the "improvement" trace-analyst kind.
 *
 * Bundles the actor prompt held in `ACTOR_PROMPT2` with a tool builder and
 * the recursion / turn / runtime-size limits for this kind.
 */
var IMPROVEMENT_KIND_SPEC = {
  id: "improvement",
  description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
  // Same value as `id`.
  area: "improvement",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT2,
  // Builds the tool set by requesting the "all" group over the supplied store.
  buildTools: (store) => buildTraceToolsForGroup("all", store),
  // NOTE(review): presumably bounds subagent nesting depth and parallel fan-out — confirm against the consumer of this spec.
  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
  maxTurns: 30,
  // 12e3 === 12000. NOTE(review): looks like a character cap on runtime output — confirm where this field is read.
  maxRuntimeChars: 12e3,
  cost: { kind: "llm" }
};
674
+
675
+ // src/analyst/kinds/knowledge-gap.ts
676
/**
 * Actor prompt for the "knowledge-gap" analyst kind; used verbatim as
 * `actorDescription` of `KNOWLEDGE_GAP_KIND_SPEC`.
 *
 * This is runtime text sent to the model: whitespace, escapes (`\u2014`,
 * `\u2192`, `\u2026`) and the escaped backticks are all part of the prompt.
 */
var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.

The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.

DISCOVERY \u2192 ATTRIBUTE-TO-LAYER \u2192 CITE protocol:

1. \`traces.getDatasetOverview({})\` first. Note which agents, tools, and models appear.
2. Pull traces where the agent shows gap signals. The strongest signals are:
- Self-correction turns ("I assumed X but\u2026", "let me re-check", "actually,")
- Clarifying-question turns where the agent asked the user something the runtime should have surfaced
- Repeated retrieval / lookup calls for the same artifact with slightly varied queries
- Tool errors that name a missing argument or unknown resource
- Web-search calls returning pages dated before a known cutoff for content that changes (versioned APIs, schemas, policies)
- Agent quoting a tool's docs / system prompt incorrectly because the actual text was insufficient
- Fabricated identifiers that don't appear in dataset \`sample_trace_ids\`
Use \`traces.searchTrace\` with patterns like \`I (don.?t|do not) know\`, \`assumed\`, \`unclear\`, \`could you (clarify|tell me|provide)\`, \`not found\`, \`undefined\`, \`unknown\`, \`null\`, dates older than the analysis window, or the agent's specific clarification phrases.
3. For each gap, identify the **layer of the runtime that should have prevented it**. The locus is the value of \`subject\` on the finding. Use one of:
- \`agent-knowledge:wiki:<page-slug>\` \u2014 the wiki page that should exist but doesn't, or exists but lacks the claim
- \`agent-knowledge:wiki:<page-slug>#<heading>\` \u2014 wiki page exists but a specific section is missing
- \`agent-knowledge:claim:<topic>\` \u2014 a specific claim/relation triple that should be in the wiki
- \`agent-knowledge:raw:<source-id>\` \u2014 raw source captured but never lifted into a curated page
- \`agent-knowledge:stale:<page-slug>\` \u2014 wiki page exists but contradicts ground-truth evidence in this trace (the wiki itself drifted)
- \`websearch:outdated:<topic>\` \u2014 agent relied on a web result that was stale; wiki should have superseded it
- \`tool-doc:<tool-name>:<aspect>\` \u2014 tool description missed a behavior aspect (return shape, failure modes, side effects)
- \`system-prompt:<section>\` \u2014 system prompt should have stated the rule directly
- \`memory:<key>\` \u2014 prior-run memory should have surfaced an earlier decision
4. For each gap you can defend with evidence, emit ONE finding with:
- \`area\` = "knowledge-gap"
- \`subject\` = the locus string from the list above
- \`claim\` = a sentence naming the missing or stale knowledge ("wiki has no page on invoice line-item shape, agent had to re-derive it from raw spans")
- \`severity\` = "high" when the gap caused a failure or a clarifying question; "medium" when it caused unnecessary turns; "low" when it caused minor inefficiency
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the moment the gap surfaced (the question, the self-correction, the retrieval miss, the stale web result)
- \`evidence_excerpt\` = exact quote where the agent showed the gap
- \`confidence\` = 0.85+ when the agent itself articulated the gap; 0.6-0.8 when inferred from behavior
- \`recommended_action\` = phrased as a wiki edit when the locus is \`agent-knowledge:*\` ("Create wiki page \`invoice-line-items\` with claims: ..."), or as a prompt/tool-doc edit otherwise

**Delegate per layer.** After your first scan, you should have candidates spread across \`agent-knowledge:*\`, \`websearch:outdated\`, \`tool-doc:*\`, \`system-prompt:*\`, and \`memory:*\`. Spawn one \`llmQuery\` per layer in parallel \u2014 each subagent runs a focused detection (e.g. the \`agent-knowledge\` subagent looks for both missing-pages AND stale-pages; the \`websearch\` subagent looks specifically for date staleness signals; the \`tool-doc\` subagent looks for tool-call argument errors a fuller description would have prevented). Subagents return findings; you merge and emit one \`final({ findings })\` at the top.

Do NOT report a gap that the agent later recovered from cleanly within the same turn \u2014 that's resilience, not a gap. Cite the *non-recovery* version when both exist.

OBSERVABILITY rules:
- Each non-final turn must emit at least one \`console.log\` for evidence.
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
719
/**
 * Spec for the "knowledge-gap" trace-analyst kind.
 *
 * Bundles the actor prompt held in `ACTOR_PROMPT3` with a tool builder and
 * the recursion / turn limits for this kind.
 */
var KNOWLEDGE_GAP_KIND_SPEC = {
  id: "knowledge-gap",
  description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
  // Same value as `id`.
  area: "knowledge-gap",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT3,
  // Unlike the other kinds in this file, this one requests the
  // "discoveryAndSearch" tool group rather than "all".
  buildTools: (store) => buildTraceToolsForGroup("discoveryAndSearch", store),
  // NOTE(review): presumably bounds subagent nesting depth and parallel fan-out — confirm against the consumer of this spec.
  recursion: { maxDepth: 2, maxParallelSubagents: 4 },
  maxTurns: 18,
  cost: { kind: "llm" }
};
730
+
731
+ // src/analyst/kinds/knowledge-poisoning.ts
732
/**
 * Actor prompt for the "knowledge-poisoning" analyst kind; used verbatim as
 * `actorDescription` of `KNOWLEDGE_POISONING_KIND_SPEC`.
 *
 * This is runtime text sent to the model: whitespace, escapes (`\u2014`,
 * `\u2192`) and the escaped backticks are all part of the prompt.
 */
var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).

DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:

1. \`traces.getDatasetOverview({})\` first. Identify the agents, models, and tools.
2. Pull traces where the agent's confident action was later contradicted. Strongest signals:
- Agent stated a fact in one span; a later span surfaced contradictory evidence; the agent then proceeded anyway or fabricated reconciliation.
- Tool call with stale arguments (an id that no longer exists, an API shape that changed).
- Agent cited an \`agent-knowledge\` wiki page or claim whose content contradicts the trace's own evidence \u2014 the wiki itself drifted.
- Web-search result the agent cited that returned an outdated page; agent treated it as canonical.
- System-prompt instruction the agent followed that ground-truth evidence in the trace contradicts (e.g. prompt says "use endpoint A"; tool reply says "endpoint A deprecated, use B").
- Repeated wrong-shape parsing despite the tool's actual output proving the shape.
3. Use \`traces.searchTrace\` with regex on phrases like \`actually\`, \`turns out\`, \`previously assumed\`, \`old version\`, \`deprecated\`, \`updated to\`, \`now uses\`, or specific entity names you suspect have changed.
4. For each candidate poisoning, **DUAL-VERIFY**:
- Confirm the agent actually acted on the false belief (cite the span where it did)
- Confirm the belief is actually false in this trace's own evidence (cite the span that contradicts it)
Only emit a finding when both halves are nailed down. If you can only nail one, drop it \u2014 single-evidence poisoning findings are too speculative to be useful.

**Delegate the dual-verify.** Use the recursion budget so each candidate poisoning gets one subagent investigating "did the agent act?" and one investigating "is the belief false?". After your first scan, fire off N parallel \`llmQuery\` pairs (one cluster per pair). Subagents return their findings; you accept only the ones where BOTH halves of the pair were confirmed.

For each confirmed poisoning, emit ONE finding with:
- \`area\` = "knowledge-poisoning"
- \`subject\` = the source of the false belief, one of: \`agent-knowledge:wiki:<page-slug>\` (wiki page contradicts current ground truth), \`agent-knowledge:claim:<topic>\` (a specific claim/relation went stale), \`agent-knowledge:raw:<source-id>\` (the raw source is outdated and the wiki inherited the drift), \`websearch:outdated:<url-or-topic>\`, \`tool-doc:<tool>\`, \`system-prompt:<section>\`, \`memory:<key>\`, \`prior-run-summary:<topic>\`
- \`claim\` = one sentence: "agent believed X (from source S); evidence in trace shows X is false"
- \`severity\` = "critical" when poisoning caused a wrong user-visible action; "high" when caught internally but wasted significant work; "medium" for inefficiency only
- \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the action span (the moment the agent acted on the false belief)
- \`evidence_excerpt\` = exact quote of the confident-but-wrong claim or action
- \`confidence\` = 0.85+ when both halves are exact-quote backed; 0.6-0.8 when one half is inferred
- \`recommended_action\` = where the source should be updated and how ("Update wiki page \`X\` claim \`Y\` to '...'", "Invalidate raw source \`Z\` and re-curate", "Replace system-prompt section X with 'tool foo now returns Y'")

Do NOT report a finding if the agent caught and corrected the false belief in the same turn \u2014 that's the system working. Reserve poisoning for cases where the false belief shaped downstream action.

OBSERVABILITY rules:
- Each non-final turn must emit at least one \`console.log\` for evidence.
- Call \`final({ findings: [...] })\` exactly once at the top level.`;
767
/**
 * Spec for the "knowledge-poisoning" trace-analyst kind.
 *
 * Bundles the actor prompt held in `ACTOR_PROMPT4` with a tool builder and
 * the recursion / turn limits for this kind.
 */
var KNOWLEDGE_POISONING_KIND_SPEC = {
  id: "knowledge-poisoning",
  description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
  // Same value as `id`.
  area: "knowledge-poisoning",
  version: "1.0.0",
  actorDescription: ACTOR_PROMPT4,
  // Builds the tool set by requesting the "all" group over the supplied store.
  buildTools: (store) => buildTraceToolsForGroup("all", store),
  // NOTE(review): presumably bounds subagent nesting depth and parallel fan-out — confirm against the consumer of this spec.
  recursion: { maxDepth: 2, maxParallelSubagents: 4 },
  maxTurns: 20,
  cost: { kind: "llm" }
};
778
+
779
+ // src/analyst/kinds/index.ts
780
/**
 * The four built-in trace-analyst kind specs, in a fixed order:
 * failure-mode, knowledge-gap, knowledge-poisoning, improvement.
 *
 * The improvement prompt describes itself as consuming findings from
 * "upstream analysts", and it is listed last here.
 * NOTE(review): whether consumers actually execute kinds in array order is
 * not visible in this file — confirm before relying on it.
 */
var DEFAULT_TRACE_ANALYST_KINDS = [
  FAILURE_MODE_KIND_SPEC,
  KNOWLEDGE_GAP_KIND_SPEC,
  KNOWLEDGE_POISONING_KIND_SPEC,
  IMPROVEMENT_KIND_SPEC
];
786
+
787
+ // src/analyst/registry.ts
788
+ import { randomUUID } from "crypto";
789
/**
 * Registry that stores analysts by id and runs the selected ones one after
 * another over a shared `inputs` bag, collecting findings, per-analyst
 * summaries and the summed cost.
 *
 * Analysts run sequentially in registration order (the backing `Map`
 * preserves insertion order). A throwing analyst is recorded as `failed` and
 * does not stop the remaining analysts.
 */
var AnalystRegistry = class {
  /** Registered analysts keyed by `analyst.id`. */
  analysts = /* @__PURE__ */ new Map();
  /** Constructor options; this class reads `log`, `hooks`, `chat` and `defaultBudget`. */
  options;
  /**
   * @param {object} [options] Registry-wide options (`log`, `hooks`, `chat`, `defaultBudget`).
   */
  constructor(options = {}) {
    this.options = options;
  }
  /**
   * Adds an analyst to the registry.
   *
   * @param {object} analyst Must carry a truthy `id` and `version`.
   * @throws {Error} When `id` is missing, already registered, or `version` is missing.
   */
  register(analyst) {
    if (!analyst.id) throw new Error("AnalystRegistry.register: analyst.id is required");
    if (this.analysts.has(analyst.id)) {
      throw new Error(`AnalystRegistry.register: duplicate analyst id "${analyst.id}"`);
    }
    if (!analyst.version) {
      throw new Error(`AnalystRegistry.register: analyst "${analyst.id}" must declare a version`);
    }
    this.analysts.set(analyst.id, analyst);
  }
  /**
   * Lists the registered analysts as plain descriptors.
   *
   * @returns {Array<{id: *, description: *, version: *, cost: *}>} One entry per analyst, in registration order.
   */
  list() {
    return Array.from(this.analysts.values()).map((a) => ({
      id: a.id,
      description: a.description,
      version: a.version,
      cost: a.cost
    }));
  }
  /**
   * Runs the selected analysts and resolves with the final result.
   *
   * Drains `runStream()` and returns the payload of its `run-completed` event.
   *
   * @param {string} runId Identifier copied into events, log fields and the result.
   * @param {object} inputs Input bag; see `routeInput` for the keys that are read.
   * @param {object} [runOpts] Per-run options (`only`, `skip`, `budget`, `timeoutMs`, `tags`, `signal`, `priorFindings`).
   * @returns {Promise<object>} The run result (`findings`, `per_analyst`, `total_cost_usd`, ...).
   * @throws {Error} If the stream ends without a `run-completed` event.
   */
  async run(runId, inputs, runOpts = {}) {
    for await (const ev of this.runStream(runId, inputs, runOpts)) {
      if (ev.type === "run-completed") return ev.result;
    }
    throw new Error("AnalystRegistry.run: stream completed without run-completed event");
  }
  /**
   * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
   * in real time — `run-started`, then per-analyst `skipped` /
   * `started` / `completed`, then a terminal `run-completed` whose
   * payload is the full `AnalystRunResult`. UIs use this to render
   * progress; persistence consumers use `run()` and read the result.
   *
   * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
   * `onComplete`) fire as before — streaming is additive, not a hook
   * replacement.
   *
   * An analyst whose input cannot be routed (absent input, or an
   * `inputKind` this registry does not recognise) is reported as
   * `skipped`. An analyst that throws is reported via `analyst-completed`
   * with a `failed` summary; findings returned by the `onError` hook are
   * kept in its place.
   */
  async *runStream(runId, inputs, runOpts = {}) {
    const correlationId = `ar_${randomUUID().slice(0, 12)}`;
    const log = this.options.log ?? (() => {
    });
    const hooks = this.options.hooks ?? {};
    const startedAt = (/* @__PURE__ */ new Date()).toISOString();
    const started = Date.now();
    // Absolute deadline handed to analysts through ctx; it is not enforced in this method.
    const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
    const selected = this.selectAnalysts(runOpts);
    const budget = runOpts.budget ?? this.options.defaultBudget;
    yield {
      type: "run-started",
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      analyst_ids: selected.map((a) => a.id)
    };
    const summaries = [];
    const allFindings = [];
    let totalCost = 0;
    // Stays undefined when no total budget is configured.
    let remainingUsd = budget?.totalUsd;
    for (const analyst of selected) {
      const t0 = Date.now();
      const input = this.routeInput(analyst, inputs);
      if (input.kind === "missing") {
        const summary = {
          analyst_id: analyst.id,
          status: "skipped",
          reason: `missing input of kind '${analyst.inputKind}'`,
          findings_count: 0,
          latency_ms: 0,
          cost_usd: 0
        };
        summaries.push(summary);
        log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
        yield { type: "analyst-skipped", summary };
        continue;
      }
      const perBudget = allocateBudget(budget, {
        analyst,
        remainingUsd,
        // Counts every selected analyst, including ones that end up skipped.
        runningCount: selected.length
      });
      const ctx = {
        runId,
        correlationId,
        deadlineMs,
        budgetUsd: perBudget,
        chat: this.options.chat,
        tags: runOpts.tags,
        log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
        signal: runOpts.signal,
        priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
      };
      await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
      yield {
        type: "analyst-started",
        analyst_id: analyst.id,
        started_at: new Date(t0).toISOString()
      };
      try {
        const findings = await analyst.analyze(input.value, ctx);
        const latency = Date.now() - t0;
        // Cost is derived from the findings' own metadata.
        const cost = sumFindingCost(findings);
        totalCost += cost;
        if (typeof remainingUsd === "number") remainingUsd = Math.max(0, remainingUsd - cost);
        allFindings.push(...findings);
        const summary = {
          analyst_id: analyst.id,
          status: "ok",
          findings_count: findings.length,
          latency_ms: latency,
          cost_usd: cost
        };
        summaries.push(summary);
        log(`[analyst] ok ${analyst.id}`, {
          runId,
          findings: findings.length,
          latency_ms: latency,
          cost_usd: cost
        });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
        yield { type: "analyst-completed", summary, findings };
      } catch (err) {
        const latency = Date.now() - t0;
        // Normalise non-Error throwables so `message` and `constructor.name` are always available.
        const e = err instanceof Error ? err : new Error(String(err));
        const hookFindings = await hooks.onError?.({ analyst, error: e, runId }) ?? [];
        if (hookFindings.length) allFindings.push(...hookFindings);
        const summary = {
          analyst_id: analyst.id,
          status: "failed",
          findings_count: hookFindings.length,
          latency_ms: latency,
          cost_usd: 0,
          error: { class: e.constructor.name, message: e.message }
        };
        summaries.push(summary);
        log(`[analyst] FAIL ${analyst.id}`, {
          runId,
          error_class: e.constructor.name,
          error: e.message
        });
        await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
        yield { type: "analyst-completed", summary, findings: hookFindings };
      }
    }
    const result = {
      run_id: runId,
      correlation_id: correlationId,
      started_at: startedAt,
      ended_at: (/* @__PURE__ */ new Date()).toISOString(),
      findings: allFindings,
      per_analyst: summaries,
      total_cost_usd: totalCost
    };
    await hooks.onComplete?.({ result });
    yield { type: "run-completed", result };
  }
  /**
   * Applies the `only` / `skip` id filters to the registered analysts.
   *
   * @param {{only?: string[], skip?: string[]}} opts Empty or absent lists mean "no filter".
   * @returns {object[]} Matching analysts in registration order.
   */
  selectAnalysts(opts) {
    let candidates = Array.from(this.analysts.values());
    if (opts.only?.length) {
      const only = new Set(opts.only);
      candidates = candidates.filter((a) => only.has(a.id));
    }
    if (opts.skip?.length) {
      const skip = new Set(opts.skip);
      candidates = candidates.filter((a) => !skip.has(a.id));
    }
    return candidates;
  }
  /**
   * Picks the entry of `inputs` that matches `analyst.inputKind`.
   *
   * @param {object} analyst Analyst whose `inputKind` (and, for "custom", `id`) is read.
   * @param {object} inputs Bag with optional `traceStore`, `artifactDir`, `runRecord`, `judgeInput`, `custom`.
   * @returns {{kind: "present", value: *} | {kind: "missing"}} Always an object, never `undefined`.
   */
  routeInput(analyst, inputs) {
    switch (analyst.inputKind) {
      case "trace-store":
        return inputs.traceStore ? { kind: "present", value: inputs.traceStore } : { kind: "missing" };
      case "artifact-dir":
        return inputs.artifactDir ? { kind: "present", value: inputs.artifactDir } : { kind: "missing" };
      case "run-record":
        return inputs.runRecord ? { kind: "present", value: inputs.runRecord } : { kind: "missing" };
      case "judge-input":
        return inputs.judgeInput ? { kind: "present", value: inputs.judgeInput } : { kind: "missing" };
      case "custom": {
        // Custom inputs are keyed by analyst id; any defined value (even a falsy one) counts as present.
        const v = inputs.custom?.[analyst.id];
        return v !== void 0 ? { kind: "present", value: v } : { kind: "missing" };
      }
      default:
        // An unrecognised or undefined inputKind cannot be routed. Reporting it
        // as missing lets runStream skip this analyst instead of crashing the
        // whole run on `input.kind`.
        return { kind: "missing" };
    }
  }
};
978
/**
 * Computes the USD budget handed to one analyst.
 *
 * Resolution order:
 *   1. no policy                -> `undefined` (unbudgeted)
 *   2. `policy.allocate` is set -> its return value, passed through untouched
 *   3. no `policy.totalUsd`     -> `undefined`
 *   4. otherwise an even share `totalUsd / runningCount`, scaled by
 *      `policy.weights[analyst.id]` (default weight 1) when weights exist
 *
 * The computed share (case 4) is capped at `args.remainingUsd` when that is a
 * number, so an allocation never promises more than what is left of the total
 * after earlier spend or a weight greater than 1.
 *
 * @param {object | undefined} policy Budget policy (`totalUsd`, optional `weights`, optional `allocate`).
 * @param {{analyst: {id: string}, remainingUsd?: number, runningCount: number}} args
 * @returns {number | undefined} Per-analyst budget in USD, or `undefined` when unbudgeted.
 */
function allocateBudget(policy, args) {
  if (!policy) return void 0;
  if (policy.allocate) {
    return policy.allocate({
      analyst: args.analyst,
      totalUsd: policy.totalUsd,
      remainingUsd: args.remainingUsd,
      runningCount: args.runningCount
    });
  }
  if (policy.totalUsd == null) return void 0;
  // The divisor is the analyst count (not a sum of weights); floor it at 1 to avoid dividing by 0.
  const analystCount = Math.max(1, args.runningCount);
  const weight = policy.weights ? policy.weights[args.analyst.id] ?? 1 : 1;
  const share = policy.totalUsd * weight / analystCount;
  return typeof args.remainingUsd === "number" ? Math.min(share, args.remainingUsd) : share;
}
996
/**
 * Totals the `metadata.cost_usd` values carried by a list of findings.
 *
 * Findings without metadata, or whose `cost_usd` is not a finite number
 * (strings, NaN, Infinity, ...), contribute nothing.
 *
 * @param {Iterable<{metadata?: {cost_usd?: unknown}}>} findings
 * @returns {number} Sum of the finite numeric costs; 0 for an empty list.
 */
function sumFindingCost(findings) {
  return [...findings].reduce((total, finding) => {
    const cost = finding.metadata?.cost_usd;
    const countable = typeof cost === "number" && Number.isFinite(cost);
    return countable ? total + cost : total;
  }, 0);
}
1004
/**
 * Selects the prior findings that should be shown to one analyst.
 *
 * `source` may be:
 *   - falsy      -> `undefined`
 *   - an array   -> the entries whose `analyst_id` equals `analystId`
 *   - a record   -> the list stored under `analystId` followed by the list
 *                   stored under the wildcard key "*"
 *
 * Record lookups only consider the record's own properties, so an analyst id
 * that collides with an inherited `Object.prototype` member ("constructor",
 * "toString", ...) is treated as absent instead of yielding a function that
 * cannot be spread.
 *
 * @param {Array<{analyst_id: string}> | Record<string, object[]> | undefined} source
 * @param {string} analystId
 * @returns {object[] | undefined} A non-empty list, or `undefined` when nothing matches.
 */
function selectPriorFindings(source, analystId) {
  if (!source) return void 0;
  if (Array.isArray(source)) {
    const matching = source.filter((f) => f.analyst_id === analystId);
    return matching.length > 0 ? matching : void 0;
  }
  // Own-property read with the same nullish fallback the direct lookup had.
  const ownList = (key) => Object.prototype.hasOwnProperty.call(source, key) ? source[key] ?? [] : [];
  const merged = [...ownList(analystId), ...ownList("*")];
  return merged.length > 0 ? merged : void 0;
}
1016
+
1017
+ export {
1018
+ computeFindingId,
1019
+ makeFinding,
1020
+ FINDING_SUBJECT_KINDS,
1021
+ parseFindingSubject,
1022
+ renderFindingSubject,
1023
+ FINDING_SUBJECT_GRAMMAR_PROMPT,
1024
+ KIND_EXPECTED_SUBJECTS,
1025
+ FindingSubjectStringSchema,
1026
+ stripCodeFences,
1027
+ coerceJson,
1028
+ coerceToFindingRows,
1029
+ ANALYST_SEVERITIES,
1030
+ RawAnalystFindingSchema,
1031
+ RAW_FINDING_SCHEMA_PROMPT,
1032
+ parseRawFinding,
1033
+ structureFindings,
1034
+ createTraceAnalystKind,
1035
+ renderPriorFindings,
1036
+ buildTraceToolsForGroup,
1037
+ FAILURE_MODE_KIND_SPEC,
1038
+ IMPROVEMENT_KIND_SPEC,
1039
+ KNOWLEDGE_GAP_KIND_SPEC,
1040
+ KNOWLEDGE_POISONING_KIND_SPEC,
1041
+ DEFAULT_TRACE_ANALYST_KINDS,
1042
+ AnalystRegistry
1043
+ };
1044
+ //# sourceMappingURL=chunk-WYIHD6EB.js.map