npm - @tangle-network/agent-runtime - Versions diffs - 0.23.0 → 0.25.0 - Mend

@tangle-network/agent-runtime 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/README.md +85 -498
package/dist/agent.d.ts +5 -206
package/dist/chunk-GLR25NG7.js +92 -0
package/dist/chunk-GLR25NG7.js.map +1 -0
package/dist/{chunk-7HN72MF3.js → chunk-QZEDHTT2.js} +2 -2
package/dist/chunk-QZEDHTT2.js.map +1 -0
package/dist/{chunk-IQHYOJU3.js → chunk-ZJACJZF7.js} +289 -1
package/dist/chunk-ZJACJZF7.js.map +1 -0
package/dist/improvement-adapter-CaZxFxTd.d.ts +207 -0
package/dist/improvement.d.ts +120 -0
package/dist/improvement.js +161 -0
package/dist/improvement.js.map +1 -0
package/dist/index.js +7 -1
package/dist/index.js.map +1 -1
package/dist/local-harness-KrdFTY5R.d.ts +82 -0
package/dist/mcp/bin.js +2 -1
package/dist/mcp/bin.js.map +1 -1
package/dist/mcp/index.d.ts +190 -2
package/dist/mcp/index.js +21 -13
package/dist/mcp/index.js.map +1 -1
package/package.json +17 -23
package/dist/chunk-7HN72MF3.js.map +0 -1
package/dist/chunk-IQHYOJU3.js.map +0 -1

package/dist/improvement-adapter-CaZxFxTd.d.ts ADDED Viewed

@@ -0,0 +1,207 @@
+import { FindingSubject, AnalystFinding } from '@tangle-network/agent-eval';
+import { I as ImprovementAdapter } from './types-D_MXrmJP.js';
+/**
+ * `AgentSurfaces` — declarative map of the mutable file/directory paths
+ * the self-improvement loop can edit on behalf of an agent.
+ *
+ * The substrate uses this map to resolve every parsed `FindingSubject`
+ * (from agent-eval) to a real on-disk path. No per-vertical glue;
+ * no fabricated paths; no silent `existsSync(...)` skips that hide
+ * misconfiguration from the operator.
+ *
+ * Surfaces are validated at `defineAgent` time — missing paths fail
+ * loud with a list of every offender. A surface that's not needed
+ * (e.g. an agent with no RAG corpora) is simply omitted; the loop
+ * refuses to route those subjects rather than fabricating a target.
+ */
+/**
+ * Surface declarations. Every path is repo-relative (or absolute) at
+ * `defineAgent` time. At resolution time, paths are joined against the
+ * agent's `repoRoot`.
+ *
+ * `systemPrompt`, `tools`, `personas` are DIRECTORIES; the loop appends
+ * `<section>.md`, `<tool>/README.md`, `<persona-id>.yaml` etc.
+ * `rubric`, `outputSchema` are SINGLE FILES; the loop edits them in
+ * place.
+ *
+ * `knowledge` is the agent-knowledge root (typically `.agent-knowledge`);
+ * `applyKnowledgeWriteBlocks` writes pages relative to it.
+ *
+ * Optional surfaces (`scaffolding`, `memory`, `rag`, `outputSchema`)
+ * can be omitted — the loop will reject findings targeting them with a
+ * clear log message instead of fabricating a path.
+ */
+interface AgentSurfaces {
+    // --- Required surfaces (non-optional properties) ---
+    /** Directory containing one markdown file per system-prompt section. */
+    systemPrompt: string;
+    /** Directory containing one subdir per tool (`<tool>/README.md`). */
+    tools: string;
+    /** Single file (TypeScript module) defining the rubric weights + dimensions. */
+    rubric: string;
+    /** Knowledge-base root; typically `.agent-knowledge`. */
+    knowledge: string;
+    /** Directory containing one YAML/JSON file per persona. */
+    personas: string;
+    // --- Optional surfaces (omit when the agent has no such surface) ---
+    /** Optional: directory containing scaffolding rules (precondition checks, retry policies). */
+    scaffolding?: string;
+    /** Optional: memory store path (JSONL / SQLite / DB). */
+    memory?: string;
+    /** Optional: directory containing RAG corpora (`<corpus>/<doc-id>.md`). */
+    rag?: string;
+    /** Optional: single file defining the output schema (Zod / JSON Schema). */
+    outputSchema?: string;
+}
+/**
+ * The on-disk target a `FindingSubject` resolves to; this is the non-null
+ * result type of `resolveSubjectPath`.
+ */
+interface ResolvedSurface {
+    /** Absolute filesystem path the operator can `cat` / `vim`. */
+    absolutePath: string;
+    /** Repo-relative path for PR descriptions, diffs, audit logs. */
+    repoRelativePath: string;
+    /** Whether the path currently exists on disk. */
+    exists: boolean;
+    /** The substrate's intent: edit an existing file or create a new one. */
+    intent: 'edit-existing' | 'create-new';
+}
+/**
+ * Resolve a parsed `FindingSubject` to the file path the substrate
+ * should edit (or create) on disk.
+ *
+ * Returns `null` when:
+ *   - the subject targets a surface the agent didn't declare
+ *     (e.g. `rag:*` when `surfaces.rag` is undefined), OR
+ *   - the subject is a `cluster` (failure-mode emits these as evidence,
+ *     not actionable mutations — they don't route to a file).
+ *
+ * Returns a `ResolvedSurface` with `intent: 'create-new'` when the
+ * subject names a path that doesn't yet exist (e.g. a new wiki page).
+ * The caller chooses whether to honour the create — for tightly-managed
+ * surfaces like `systemPrompt` it's usually a contract violation
+ * (the analyst named a section that doesn't exist); for `knowledge`
+ * it's the whole point.
+ *
+ * @param subject Parsed finding subject to route.
+ * @param surfaces The agent's declared surface paths.
+ * @param repoRoot Root directory the surface paths are joined against.
+ * @returns The resolved target, or `null` when the subject does not route
+ *   to a file (see the cases above).
+ */
+declare function resolveSubjectPath(subject: FindingSubject, surfaces: AgentSurfaces, repoRoot: string): ResolvedSurface | null;
+/**
+ * Validate that every declared surface exists on disk under `repoRoot`.
+ *
+ * Returns an array of `SurfaceValidationIssue` — empty when all required
+ * surfaces resolve. `defineAgent` throws with the issues rendered, so
+ * a misconfigured manifest fails at startup (not at the first finding
+ * the loop produces 20 minutes later).
+ */
+/** One problem found for a single declared surface; element type of the `validateSurfaces` result. */
+interface SurfaceValidationIssue {
+    /** Which `AgentSurfaces` key the issue concerns. */
+    surface: keyof AgentSurfaces;
+    /** The offending path. NOTE(review): whether this is the declared or the resolved path is not visible here — confirm. */
+    path: string;
+    /**
+     * Why the surface failed validation. Presumably `missing` means the path
+     * is absent, and `not-directory` / `not-file` mean it exists but is the
+     * wrong kind for that surface — confirm against the implementation.
+     */
+    reason: 'missing' | 'not-directory' | 'not-file';
+}
+/**
+ * Check the declared surfaces against the filesystem under `repoRoot`.
+ *
+ * @param surfaces The agent's declared surface paths.
+ * @param repoRoot Root directory the surface paths are checked under.
+ * @returns A read-only list of issues; empty when every required surface resolves.
+ */
+declare function validateSurfaces(surfaces: AgentSurfaces, repoRoot: string): ReadonlyArray<SurfaceValidationIssue>;
+/**
+ * Render a list of surface validation issues as a single string.
+ * NOTE(review): the output format is not visible in this declaration file;
+ * presumably this is the text `defineAgent` throws with — confirm.
+ *
+ * @param issues Issues as returned by `validateSurfaces`.
+ * @param repoRoot Root directory the issues were validated under.
+ * @returns The rendered message.
+ */
+declare function renderSurfaceIssues(issues: ReadonlyArray<SurfaceValidationIssue>, repoRoot: string): string;
+/**
+ * Substrate-default `ImprovementAdapter` — surfaces-driven, LLM-drafted
+ * patches, optional auto-apply or PR-open.
+ *
+ * This is the one ImprovementAdapter every vertical agent uses. The
+ * substrate parses each finding's `subject` via
+ * `parseFindingSubject` (agent-eval), resolves it to a real file path
+ * via the agent's `AgentSurfaces`, reads the current content, and asks
+ * an LLM to draft a unified-diff patch given the finding + current
+ * content + per-kind editing-discipline rules.
+ *
+ * Auto-apply gates on the source-finding's confidence and the
+ * autoApply.improvement policy. Two modes:
+ *   `write` — apply the patch in-place via `git apply -p0`. Operator
+ *     reviews via `git diff`.
+ *   `open-pr` — write to a branch, commit, push, open a PR via `gh`.
+ *     Operator reviews via the PR UI.
+ *
+ * Fail-loud rules:
+ *   - Findings whose subject doesn't parse → counted in `errors`.
+ *   - Findings whose subject targets an undeclared surface → counted in
+ *     `errors` with the offending kind in the message.
+ *   - Findings whose target path doesn't exist AND the kind isn't a
+ *     create-new variant (`new-tool`, `knowledge.wiki`) → counted in
+ *     `errors` with the resolved path in the message.
+ *   - LLM drafts that fail JSON-schema validation → counted in
+ *     `errors` with the schema issue.
+ *
+ * No silent skips. Every dropped finding has a recorded reason the
+ * loop's report surfaces.
+ */
+/**
+ * One proposed edit to a surface file, drafted from a single source finding.
+ * This is the edit type the surface improvement adapter is parameterised
+ * with (`ImprovementAdapter<SurfaceImprovementEdit>`).
+ */
+interface SurfaceImprovementEdit {
+    /** Stable id derived from the source finding so re-proposals are idempotent. */
+    id: string;
+    /** The finding that produced this edit — for revert + audit trail. */
+    sourceFindingId: string;
+    /** Parsed subject; included so the apply step doesn't re-parse. */
+    subject: FindingSubject;
+    /** Resolved on-disk target. */
+    target: ResolvedSurface;
+    /** SHA-256 of the current file content the patch was drafted against. */
+    baseSha256: string;
+    /** Unified-diff patch the LLM drafted (relative to `target.absolutePath`). */
+    patch: string;
+    /** One-line summary the operator sees in the report / PR title. */
+    summary: string;
+    /** Multi-line rationale for the PR body — finding context + LLM reasoning. */
+    rationale: string;
+    /** Carry-forward from the finding so the apply gate can check the threshold. */
+    confidence: number;
+    /** Carry-forward severity for prioritization. */
+    severity: AnalystFinding['severity'];
+}
+/** Options accepted by `createSurfaceImprovementAdapter`. */
+interface CreateSurfaceImprovementAdapterOpts {
+    /** The agent's declared surface paths, used to route each finding to a file. */
+    surfaces: AgentSurfaces;
+    /** Root directory the surface paths are resolved against. */
+    repoRoot: string;
+    /**
+     * LLM-draft callback. Given a finding + current file content + the
+     * resolved target, returns a unified-diff patch + summary + rationale.
+     *
+     * Required — the substrate doesn't ship a hardcoded prompt; the agent
+     * author picks the model (Haiku for cheap routine drafts, Sonnet for
+     * substantive prompt rewrites, etc.) via this callback.
+     */
+    draftPatch: (input: DraftPatchInput) => Promise<DraftPatchOutput>;
+    /**
+     * Apply mode:
+     *   `write` — `git apply` in-place; operator reviews via `git diff`
+     *   `open-pr` — branch + commit + push + `gh pr create`
+     *   `none` — never apply; collect proposals for the report only
+     *
+     * The `apply` method honours this even when the loop calls it; the
+     * effective behaviour is also gated on the per-finding confidence
+     * threshold via `runAnalystLoop`'s `autoApply` policy.
+     */
+    mode?: 'write' | 'open-pr' | 'none';
+    /** When `mode === 'open-pr'`, the base branch new PRs target. Default: `main`. */
+    baseBranch?: string;
+    /** Required for `mode === 'open-pr'` — the GH owner/repo (`tangle-network/tax-agent`). */
+    ghRepo?: string;
+    /**
+     * Subject kinds for which the substrate may CREATE the file when the
+     * resolved target doesn't exist (e.g. `knowledge.wiki`, `new-tool`).
+     * Kinds such as `system-prompt` / `rubric` are not create-eligible by
+     * default: a named section that doesn't exist is a contract violation,
+     * not a scaffolding opportunity.
+     * NOTE(review): the exact default list is not visible in this
+     * declaration file — confirm against the implementation.
+     */
+    allowCreateForKinds?: ReadonlyArray<FindingSubject['kind']>;
+}
+/** Argument passed to the `draftPatch` callback for one finding. */
+interface DraftPatchInput {
+    /** The analyst finding the patch should address. */
+    finding: AnalystFinding;
+    /** The finding's parsed subject. */
+    subject: FindingSubject;
+    /** The on-disk target the subject resolved to. */
+    target: ResolvedSurface;
+    /** Current file content (empty string when `intent === 'create-new'`). */
+    currentContent: string;
+}
+/** Value the `draftPatch` callback resolves with for one finding. */
+interface DraftPatchOutput {
+    /** Unified diff against the current file content. Empty string skips this finding. */
+    patch: string;
+    /** One-line summary for the operator. */
+    summary: string;
+    /** Multi-line rationale for the PR body. */
+    rationale: string;
+}
+/**
+ * Create the surfaces-driven improvement adapter.
+ *
+ * @param opts Surfaces, repo root, the required `draftPatch` callback and
+ *   optional apply-mode settings.
+ * @returns An `ImprovementAdapter` whose edits are `SurfaceImprovementEdit`s.
+ */
+declare function createSurfaceImprovementAdapter(opts: CreateSurfaceImprovementAdapterOpts): ImprovementAdapter<SurfaceImprovementEdit>;
+export { type AgentSurfaces as A, type CreateSurfaceImprovementAdapterOpts as C, type DraftPatchInput as D, type ResolvedSurface as R, type SurfaceImprovementEdit as S, type DraftPatchOutput as a, type SurfaceValidationIssue as b, createSurfaceImprovementAdapter as c, resolveSubjectPath as d, renderSurfaceIssues as r, validateSurfaces as v };

package/dist/improvement.d.ts ADDED Viewed

@@ -0,0 +1,120 @@
+import { AnalystFinding } from '@tangle-network/agent-eval';
+import { L as LocalHarness, r as runLocalHarness } from './local-harness-KrdFTY5R.js';
+import { LabeledScenarioStore, WorktreeAdapter, ImprovementDriver } from '@tangle-network/agent-eval/campaign';
+import { S as SurfaceImprovementEdit } from './improvement-adapter-CaZxFxTd.js';
+import { I as ImprovementAdapter } from './types-D_MXrmJP.js';
+import 'node:child_process';
+/**
+ * @experimental
+ *
+ * `improvementDriver` — the ONE reflective/agentic improvement driver for
+ * agent-eval's improvement loop. It implements `ImprovementDriver` and owns
+ * the candidate lifecycle (worktree create → generate → finalize/discard,
+ * × populationSize); it delegates the only thing that genuinely varies — HOW
+ * a candidate change is produced — to a pluggable `CandidateGenerator`.
+ *
+ * There is no separate "analyst driver" vs "autoresearch driver": those are
+ * the SAME driver at two settings of a dial.
+ *   - cheap reflective path  → `reflectiveGenerator` (shots=1, no sandbox;
+ *                              applies pre-drafted patches)
+ *   - full agentic path      → `agenticGenerator` (shots=N, sandbox runLoop;
+ *                              an agent reads code + report and edits)
+ * Both emit changes into a worktree the driver finalizes into a
+ * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See
+ * agent-eval's `docs/design/self-improvement-engine.md`.
+ */
+/** The byte-producing seam — the ONE thing that differs between the cheap
+ *  reflective path and the full agentic path. A generator makes (uncommitted)
+ *  changes inside `worktreePath`; the driver commits them via the worktree
+ *  adapter's `finalize`. */
+interface CandidateGenerator {
+    /** Generator identifier; the driver embeds it in its own `kind`
+     *  (`improvement:<kind>`) and in the labels of the worktrees it creates. */
+    kind: string;
+    /** Produce one candidate change inside `args.worktreePath`. */
+    generate(args: {
+        /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */
+        worktreePath: string;
+        /** Phase-2 research report (analyst findings + diff), opaque. */
+        report: unknown;
+        /** Findings resolved from the report or the loop context. */
+        findings: AnalystFinding[];
+        /** Handle to all captured data, to ground the change. */
+        dataset?: LabeledScenarioStore;
+        /** DEPTH: max iterations the generator may take (agentic uses this; the
+         *  reflective generator ignores it). */
+        maxShots: number;
+        /** Abort signal forwarded from the driver's propose context. */
+        signal: AbortSignal;
+    }): Promise<{
+        /** `true` → the driver finalizes the worktree; `false` → the driver discards it. */
+        applied: boolean;
+        /** Summary the driver passes to the worktree adapter's `finalize`. */
+        summary: string;
+    }>;
+}
+/** Options accepted by `improvementDriver`. */
+interface ImprovementDriverOptions {
+    /** Adapter used to create, finalize and discard candidate worktrees. */
+    worktree: WorktreeAdapter;
+    /** Produces the candidate change inside each worktree. */
+    generator: CandidateGenerator;
+    /** Base ref candidate worktrees fork from. Default `main`. */
+    baseRef?: string;
+}
+/**
+ * Build the improvement driver around a worktree adapter and a generator.
+ *
+ * @param opts Worktree adapter, candidate generator and optional base ref.
+ * @returns A driver whose `kind` is `improvement:<generator.kind>`.
+ */
+declare function improvementDriver(opts: ImprovementDriverOptions): ImprovementDriver<AnalystFinding>;
+/**
+ * @experimental
+ *
+ * `agenticGenerator` — the full-agentic `CandidateGenerator`: the
+ * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real
+ * coding harness (claude / codex / opencode) inside the candidate worktree the
+ * driver already created, letting the agent read the codebase + the research
+ * report and make the change in place. The driver then commits the worktree
+ * into a `CodeSurface`.
+ *
+ * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the
+ * harness as a subprocess with `cwd` = the worktree, on the same filesystem,
+ * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is
+ * the verified primitive. The OUTER sandbox is the improvement loop's own
+ * execution context; the generator does not nest a second sandbox per
+ * candidate (which would reintroduce a host↔sandbox worktree-transport
+ * problem that does not need solving here).
+ *
+ * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change
+ * (the worktree stays clean), the generator refines the prompt and retries, up
+ * to `maxShots` times. A harness that already changed files returns on shot 1.
+ */
+/** Options accepted by `agenticGenerator`; every field is optional. */
+interface AgenticGeneratorOptions {
+    /** Local coding harness to run in the worktree. Default `claude`. */
+    harness?: LocalHarness;
+    /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */
+    timeoutMs?: number;
+    /** Build the harness task prompt from the report + findings. Override for
+     *  domain phrasing; the default turns findings into a concrete coder task. */
+    buildPrompt?: (args: {
+        report: unknown;
+        findings: AnalystFinding[];
+    }) => string;
+    /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */
+    runHarness?: typeof runLocalHarness;
+    /** Test seam — inject the worktree-dirty check (defaults to `git status`). */
+    isDirty?: (worktreePath: string) => boolean;
+}
+/**
+ * Create the agentic `CandidateGenerator`.
+ *
+ * @param opts Optional overrides; may be omitted entirely.
+ * @returns A generator whose `kind` is `agentic:<harness>`.
+ */
+declare function agenticGenerator(opts?: AgenticGeneratorOptions): CandidateGenerator;
+/**
+ * @experimental
+ *
+ * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts
+ * surface edits via the existing improvement adapter (`proposeFromFindings`,
+ * one LLM patch per finding) and applies them as ONE coherent improvement into
+ * the candidate worktree. `maxShots` is ignored — reflection is single-shot by
+ * construction (the patches are already drafted).
+ *
+ * This is the `shots=1, sandbox=off` setting of the one improvement driver.
+ * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`
+ * setting — both plug into the same `improvementDriver`.
+ */
+/** Options accepted by `reflectiveGenerator`. */
+interface ReflectiveGeneratorOptions {
+    /** Adapter whose `proposeFromFindings` drafts the surface edits to apply. */
+    improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>;
+}
+/**
+ * Create the reflective `CandidateGenerator`.
+ *
+ * @param opts Carries the improvement adapter that drafts the edits.
+ * @returns A generator whose `kind` is `reflective`.
+ */
+declare function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator;
+export { type AgenticGeneratorOptions, type CandidateGenerator, type ImprovementDriverOptions, type ReflectiveGeneratorOptions, agenticGenerator, improvementDriver, reflectiveGenerator };

package/dist/improvement.js ADDED Viewed

@@ -0,0 +1,161 @@
+import {
+  runLocalHarness
+} from "./chunk-GLR25NG7.js";
+import "./chunk-DGUM43GV.js";
+// src/improvement/agentic-generator.ts
+import { spawnSync } from "child_process";
+function agenticGenerator(opts = {}) {
+  const harness = opts.harness ?? "claude";
+  const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt;
+  const run = opts.runHarness ?? runLocalHarness;
+  const dirty = opts.isDirty ?? worktreeDirty;
+  return {
+    kind: `agentic:${harness}`,
+    async generate({ worktreePath, report, findings, maxShots, signal }) {
+      let prompt = buildPrompt({ report, findings });
+      const shots = Math.max(1, maxShots);
+      for (let shot = 0; shot < shots; shot++) {
+        if (signal.aborted) break;
+        await run({
+          harness,
+          cwd: worktreePath,
+          taskPrompt: prompt,
+          timeoutMs: opts.timeoutMs,
+          signal
+        });
+        if (dirty(worktreePath)) {
+          return { applied: true, summary: summarize(findings) };
+        }
+        prompt = refine(prompt);
+      }
+      return { applied: false, summary: "" };
+    }
+  };
+}
+function defaultBuildPrompt(args) {
+  const lines = [
+    "You are improving this codebase based on an evaluation analysis.",
+    "Make the smallest set of edits that addresses the findings below, then stop.",
+    "Do not change unrelated code. Do not commit \u2014 leave changes in the working tree.",
+    "",
+    "Findings:"
+  ];
+  for (const f of args.findings) {
+    const where = f.subject ? ` [${f.subject}]` : "";
+    lines.push(`- (${f.severity})${where} ${f.claim}`);
+    if (f.recommended_action) lines.push(`    \u2192 ${f.recommended_action}`);
+  }
+  return lines.join("\n");
+}
+function refine(prompt) {
+  return `${prompt}
+NOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`;
+}
+function summarize(findings) {
+  if (findings.length === 0) return "agentic improvement";
+  if (findings.length === 1) return `agentic: ${truncate(findings[0].claim, 64)}`;
+  return `agentic: ${findings.length} findings addressed`;
+}
+function truncate(s, n) {
+  return s.length <= n ? s : `${s.slice(0, n - 1)}\u2026`;
+}
+function worktreeDirty(worktreePath) {
+  const result = spawnSync("git", ["status", "--porcelain"], {
+    cwd: worktreePath,
+    encoding: "utf-8"
+  });
+  if (result.error) {
+    throw new Error(
+      `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`
+    );
+  }
+  if (result.status !== 0) {
+    throw new Error(
+      `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`
+    );
+  }
+  return result.stdout.trim().length > 0;
+}
+// src/improvement/improvement-driver.ts
+function improvementDriver(opts) {
+  const baseRef = opts.baseRef ?? "main";
+  return {
+    kind: `improvement:${opts.generator.kind}`,
+    async propose(ctx) {
+      const findings = resolveFindings(ctx);
+      if (findings.length === 0 && ctx.report === void 0) return [];
+      const surfaces = [];
+      for (let i = 0; i < ctx.populationSize; i++) {
+        if (ctx.signal.aborted) break;
+        const wt = await opts.worktree.create({
+          baseRef,
+          label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`
+        });
+        try {
+          const { applied, summary } = await opts.generator.generate({
+            worktreePath: wt.path,
+            report: ctx.report,
+            findings,
+            dataset: ctx.dataset,
+            maxShots: ctx.maxImprovementShots ?? 1,
+            signal: ctx.signal
+          });
+          if (!applied) {
+            await opts.worktree.discard(wt);
+            continue;
+          }
+          surfaces.push(await opts.worktree.finalize(wt, summary));
+        } catch (err) {
+          await opts.worktree.discard(wt).catch(() => {
+          });
+          throw err;
+        }
+      }
+      return surfaces;
+    }
+  };
+}
+function resolveFindings(ctx) {
+  const report = ctx.report;
+  if (report && typeof report === "object" && "findings" in report) {
+    const f = report.findings;
+    if (Array.isArray(f) && f.length > 0) return f;
+  }
+  return ctx.findings;
+}
+// src/improvement/reflective-generator.ts
+import { spawnSync as spawnSync2 } from "child_process";
+function reflectiveGenerator(opts) {
+  return {
+    kind: "reflective",
+    async generate({ worktreePath, findings }) {
+      const batch = await opts.improvementAdapter.proposeFromFindings(findings);
+      if (batch.edits.length === 0) return { applied: false, summary: "" };
+      let applied = 0;
+      for (const edit of batch.edits) {
+        if (applyPatch(edit.patch, worktreePath)) applied++;
+      }
+      if (applied === 0) return { applied: false, summary: "" };
+      const summary = batch.edits.length === 1 ? batch.edits[0].summary : `analyst: ${applied} surface edit${applied === 1 ? "" : "s"}`;
+      return { applied: true, summary };
+    }
+  };
+}
+function applyPatch(patch, cwd) {
+  const result = spawnSync2("git", ["apply", "--whitespace=fix", "-p0", "-"], {
+    cwd,
+    input: patch,
+    encoding: "utf-8"
+  });
+  return result.status === 0;
+}
+export {
+  agenticGenerator,
+  improvementDriver,
+  reflectiveGenerator
+};
+//# sourceMappingURL=improvement.js.map

package/dist/improvement.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. 
Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. 
Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. 
It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). 
*/\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. 
*/\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? 
'' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;
AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACrGA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}

package/dist/index.js CHANGED Viewed

@@ -3,7 +3,7 @@ import {
   loopEventToOtelSpan,
   mcpToolsForRuntimeMcp,
   mcpToolsForRuntimeMcpSubset
-} from "./chunk-7HN72MF3.js";
+} from "./chunk-QZEDHTT2.js";
 import "./chunk-UNQM6XQO.js";
 import {
   AgentEvalError,
@@ -395,6 +395,12 @@ async function* streamResponseEvents(response, context, requestedModel) {
 function* parseStreamChunk(chunk, context, usage, toolCalls) {
   const lines = chunk.split(/\r?\n/);
   const dataLines = lines.filter((line) => line.startsWith("data:"));
+  if (dataLines.length === 0 && lines.every((line) => {
+    const trimmed = line.trim();
+    return trimmed.length === 0 || trimmed.startsWith(":");
+  })) {
+    return;
+  }
   const data = dataLines.length > 0 ? dataLines.map((line) => line.slice(5).trimStart()).join("\n") : chunk.trim();
   if (!data || data === "[DONE]") return;
   let parsed;