npm - @tangle-network/agent-eval - Versions diffs - 0.28.0 → 0.29.0 - Mend

@tangle-network/agent-eval 0.28.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/index.js CHANGED Viewed

@@ -967,6 +967,45 @@ function resolveModel(req, defaultModel) {
   return { ...req, model: defaultModel };
 }
+// src/analyst/finding-signature.ts
+import { z } from "zod";
+var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
+var RawAnalystFindingSchema = z.object({
+  severity: z.enum(ANALYST_SEVERITIES),
+  claim: z.string().min(1).max(2e3),
+  subject: z.string().max(400).optional(),
+  evidence_uri: z.string().min(1).max(2e3),
+  evidence_excerpt: z.string().max(2e3).optional(),
+  confidence: z.number().min(0).max(1),
+  rationale: z.string().max(4e3).optional(),
+  recommended_action: z.string().max(2e3).optional()
+}).strict();
+var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
+  - severity: one of "critical" | "high" | "medium" | "low" | "info"
+  - claim: one-sentence statement (max 2000 chars)
+  - subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
+  - evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
+  - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
+  - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
+  - rationale?: one or two sentences explaining the reasoning
+  - recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
+Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
+/**
+ * Validate one raw finding row against `RawAnalystFindingSchema`.
+ *
+ * @param {unknown} row - Candidate finding object to validate.
+ * @param {(msg: string, fields?: object) => void} [log] - Optional logger; called only on rejection.
+ * @returns {object | null} The schema-parsed finding, or `null` when validation fails.
+ */
+function parseRawFinding(row, log) {
+  const parsed = RawAnalystFindingSchema.safeParse(row);
+  if (parsed.success) {
+    return parsed.data;
+  }
+  // Report each schema issue as a flat { path, code, message } record.
+  log?.("finding rejected: schema failure", {
+    issues: parsed.error.issues.map((issue) => {
+      const path = issue.path.join(".");
+      return { path, code: issue.code, message: issue.message };
+    })
+  });
+  return null;
+}
 // src/analyst/findings-store.ts
 import { existsSync as existsSync2, readFileSync } from "fs";
@@ -1113,43 +1152,8 @@ function diffFindings(previous, current, policy = {}) {
   return { appeared, disappeared, persisted, changed };
 }
-// src/analyst/finding-signature.ts
-import { z } from "zod";
-var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
-var RawAnalystFindingSchema = z.object({
-  severity: z.enum(ANALYST_SEVERITIES),
-  claim: z.string().min(1).max(2e3),
-  subject: z.string().max(400).optional(),
-  evidence_uri: z.string().min(1).max(2e3),
-  evidence_excerpt: z.string().max(2e3).optional(),
-  confidence: z.number().min(0).max(1),
-  rationale: z.string().max(4e3).optional(),
-  recommended_action: z.string().max(2e3).optional()
-}).strict();
-var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
-  - severity: one of "critical" | "high" | "medium" | "low" | "info"
-  - claim: one-sentence statement (max 2000 chars)
-  - subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
-  - evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
-  - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
-  - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
-  - rationale?: one or two sentences explaining the reasoning
-  - recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
-Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
-function parseRawFinding(row, log) {
-  const result = RawAnalystFindingSchema.safeParse(row);
-  if (!result.success) {
-    log?.("finding rejected: schema failure", {
-      issues: result.error.issues.map((i) => ({ path: i.path.join("."), code: i.code, message: i.message }))
-    });
-    return null;
-  }
-  return result.data;
-}
 // src/analyst/kind-factory.ts
-import { agent, AxJSRuntime } from "@ax-llm/ax";
+import { AxJSRuntime, agent } from "@ax-llm/ax";
 function createTraceAnalystKind(spec, opts) {
   const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
   return {
@@ -1162,7 +1166,8 @@ function createTraceAnalystKind(spec, opts) {
       const tools = spec.buildTools(store);
       const maxDepth = spec.recursion?.maxDepth ?? 0;
       const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
-      const actorDescription = spec.actorDescription.trim() + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
+      const priorContext = renderPriorFindings(ctx.priorFindings);
+      const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
       const ax = agent(
         "question:string -> findings:json[]",
         {
@@ -1257,12 +1262,41 @@ function evidenceKindFromUri(uri) {
   if (uri.startsWith("finding://")) return "finding";
   return "artifact";
 }
+/**
+ * Render prior-run findings as a prompt section to append to an analyst's actor description.
+ *
+ * The result starts with a blank-line separator ("\n\n") so it can be concatenated directly
+ * after other prompt text. Previously the separator entries were empty strings that
+ * `.filter(Boolean)` stripped, so the header was glued onto the preceding text.
+ *
+ * @param {Array<{finding_id: string, severity: string, subject?: string, claim: string}> | null | undefined} prior
+ *   Prior findings; only the first 40 are listed.
+ * @returns {string} "" when there is nothing to render; otherwise the separator, a three-line
+ *   header, one row per listed finding, and an overflow note when more than 40 were supplied.
+ */
+function renderPriorFindings(prior) {
+  if (!prior || prior.length === 0) return "";
+  const MAX_ROWS = 40;
+  const rows = prior.slice(0, MAX_ROWS).map((f) => {
+    const subject = f.subject ? ` [${f.subject}]` : "";
+    // Claims are capped at 160 chars to keep the prompt section compact.
+    return `  - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
+  });
+  const lines = [
+    // Two empty entries join into the leading "\n\n" separator; do not filter them out.
+    "",
+    "",
+    "PRIOR FINDINGS (from a previous run on related data):",
+    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
+    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
+    ...rows
+  ];
+  const hidden = prior.length - MAX_ROWS;
+  if (hidden > 0) {
+    // The leading "\n" keeps the overflow note separated from the rows by a blank line, as before.
+    lines.push(`\n  ... +${hidden} more prior findings (older history truncated)`);
+  }
+  return lines.join("\n");
+}
+/**
+ * Shorten `s` to at most `max` UTF-16 code units, ending with an ellipsis when cut.
+ *
+ * @param {string} s - Text to shorten.
+ * @param {number} max - Maximum length of the result in UTF-16 code units.
+ * @returns {string} `s` unchanged when it already fits; otherwise a trimmed prefix plus "\u2026".
+ */
+function truncateForContext(s, max) {
+  if (s.length <= max) return s;
+  // Reserve one unit for the ellipsis; clamp so a non-positive `max` cannot become a
+  // negative slice index (which would keep almost the whole string).
+  let end = Math.max(0, max - 1);
+  if (end > 0) {
+    const last = s.charCodeAt(end - 1);
+    // Back off one unit when the cut would strand a high surrogate (half of an astral char).
+    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
+  }
+  return `${s.slice(0, end).trimEnd()}\u2026`;
+}
 // src/analyst/tool-groups.ts
 var TOOL_NAMES_BY_GROUP = {
   all: /* @__PURE__ */ new Set(),
   discovery: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces"]),
-  discoveryAndRead: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces", "viewTrace", "viewSpans"]),
+  discoveryAndRead: /* @__PURE__ */ new Set([
+    "getDatasetOverview",
+    "queryTraces",
+    "countTraces",
+    "viewTrace",
+    "viewSpans"
+  ]),
   discoveryAndSearch: /* @__PURE__ */ new Set([
     "getDatasetOverview",
     "queryTraces",
@@ -1323,8 +1357,59 @@ var FAILURE_MODE_KIND_SPEC = {
   cost: { kind: "llm" }
 };
+// src/analyst/kinds/improvement.ts
+var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
+Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
+DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
+1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
+2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
+   - **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
+   - **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
+   - **New tool** \u2014 add a tool the agent kept emulating in code
+   - **RAG ingestion** \u2014 add a document or correct a stale one
+   - **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
+   - **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
+   - **Output schema** \u2014 narrow the agent's output to forbid the failure shape
+3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
+4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
+5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
+For each winning recommendation, emit ONE finding with:
+- \`area\` = "improvement"
+- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
+- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
+- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
+- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
+- \`evidence_excerpt\` = a fragment showing the problem the fix targets
+- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
+- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
+- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
+If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
+Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
+OBSERVABILITY rules:
+- Each non-final turn must emit at least one \`console.log\` for evidence.
+- Call \`final({ findings: [...] })\` exactly once at the top level.`;
+var IMPROVEMENT_KIND_SPEC = {
+  id: "improvement",
+  description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
+  area: "improvement",
+  version: "1.0.0",
+  actorDescription: ACTOR_PROMPT2,
+  buildTools: (store) => buildTraceToolsForGroup("all", store),
+  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
+  maxTurns: 30,
+  maxRuntimeChars: 12e3,
+  cost: { kind: "llm" }
+};
 // src/analyst/kinds/knowledge-gap.ts
-var ACTOR_PROMPT2 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
+var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
 The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
@@ -1372,7 +1457,7 @@ var KNOWLEDGE_GAP_KIND_SPEC = {
   description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
   area: "knowledge-gap",
   version: "1.0.0",
-  actorDescription: ACTOR_PROMPT2,
+  actorDescription: ACTOR_PROMPT3,
   buildTools: (store) => buildTraceToolsForGroup("discoveryAndSearch", store),
   recursion: { maxDepth: 2, maxParallelSubagents: 4 },
   maxTurns: 18,
@@ -1380,7 +1465,7 @@ var KNOWLEDGE_GAP_KIND_SPEC = {
 };
 // src/analyst/kinds/knowledge-poisoning.ts
-var ACTOR_PROMPT3 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
+var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
 DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
@@ -1420,64 +1505,13 @@ var KNOWLEDGE_POISONING_KIND_SPEC = {
   description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
   area: "knowledge-poisoning",
   version: "1.0.0",
-  actorDescription: ACTOR_PROMPT3,
+  actorDescription: ACTOR_PROMPT4,
   buildTools: (store) => buildTraceToolsForGroup("all", store),
   recursion: { maxDepth: 2, maxParallelSubagents: 4 },
   maxTurns: 20,
   cost: { kind: "llm" }
 };
-// src/analyst/kinds/improvement.ts
-var ACTOR_PROMPT4 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
-Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
-DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
-1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
-2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
-   - **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
-   - **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
-   - **New tool** \u2014 add a tool the agent kept emulating in code
-   - **RAG ingestion** \u2014 add a document or correct a stale one
-   - **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
-   - **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
-   - **Output schema** \u2014 narrow the agent's output to forbid the failure shape
-3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
-4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
-5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
-For each winning recommendation, emit ONE finding with:
-- \`area\` = "improvement"
-- \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
-- \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
-- \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
-- \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
-- \`evidence_excerpt\` = a fragment showing the problem the fix targets
-- \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
-- \`rationale\` = why this candidate beat its alternatives (2 sentences max)
-- \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
-If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
-Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
-OBSERVABILITY rules:
-- Each non-final turn must emit at least one \`console.log\` for evidence.
-- Call \`final({ findings: [...] })\` exactly once at the top level.`;
-var IMPROVEMENT_KIND_SPEC = {
-  id: "improvement",
-  description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
-  area: "improvement",
-  version: "1.0.0",
-  actorDescription: ACTOR_PROMPT4,
-  buildTools: (store) => buildTraceToolsForGroup("all", store),
-  recursion: { maxDepth: 3, maxParallelSubagents: 4 },
-  maxTurns: 30,
-  maxRuntimeChars: 12e3,
-  cost: { kind: "llm" }
-};
 // src/analyst/kinds/index.ts
 var DEFAULT_TRACE_ANALYST_KINDS = [
   FAILURE_MODE_KIND_SPEC,
@@ -1556,7 +1590,8 @@ var AnalystRegistry = class {
         chat: this.options.chat,
         tags: runOpts.tags,
         log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
-        signal: runOpts.signal
+        signal: runOpts.signal,
+        priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
       };
       await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
       try {
@@ -1670,6 +1705,18 @@ function sumFindingCost(findings) {
   }
   return sum2;
 }
+/**
+ * Pick the prior findings that apply to one analyst.
+ *
+ * @param {Array<{analyst_id: string}> | Record<string, Array<object>> | null | undefined} source
+ *   Either a flat array (matched on `analyst_id`) or a record keyed by analyst id,
+ *   where the "*" key holds findings appended for every analyst.
+ * @param {string} analystId - Id of the analyst about to run.
+ * @returns {Array<object> | undefined} The matching findings, or `undefined` when there are none.
+ */
+function selectPriorFindings(source, analystId) {
+  if (!source) return void 0;
+  const matches = Array.isArray(source)
+    ? source.filter((finding) => finding.analyst_id === analystId)
+    : [...(source[analystId] ?? []), ...(source["*"] ?? [])];
+  // Callers receive undefined rather than an empty list.
+  return matches.length === 0 ? void 0 : matches;
+}
 // src/auto-pr.ts
 async function proposeAutomatedPullRequest(client, input) {
@@ -10320,6 +10367,7 @@ export {
   renderMarkdownReport,
   renderPlaybookMarkdown,
   renderPreferenceMemoryMarkdown,
+  renderPriorFindings,
   renderReleaseReport,
   renderSteeringText,
   replayFeedbackTrajectories,