npm - @tangle-network/agent-eval - Versions diffs - 0.17.0 → 0.17.2 - Mend

@tangle-network/agent-eval 0.17.0 → 0.17.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.js CHANGED Viewed

@@ -1386,6 +1386,1205 @@ function printDriverSummary(results) {
   console.log(`${completedCount}/${results.length} personas completed`);
 }
+// src/trace/emitter.ts
+var TraceEmitter = class {
+  store;
+  stack = [];
+  _runId;
+  now;
+  id;
+  constructor(store, options = {}) {
+    this.store = store;
+    this.now = options.now ?? (() => Date.now());
+    this.id = options.id ?? (() => cryptoRandomId());
+    this._runId = options.runId ?? this.id();
+  }
+  get runId() {
+    return this._runId;
+  }
+  // ── Run lifecycle ──────────────────────────────────────────────────
+  async startRun(run) {
+    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
+    await this.store.appendRun(full);
+    return full;
+  }
+  async endRun(outcome) {
+    const status = outcome?.pass === false ? "failed" : "completed";
+    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
+  }
+  async abortRun(reason) {
+    await this.store.updateRun(this._runId, {
+      endedAt: this.now(),
+      status: "aborted",
+      outcome: { pass: false, notes: reason }
+    });
+  }
+  // ── Generic span ───────────────────────────────────────────────────
+  async span(init) {
+    const spanId = this.id();
+    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
+    const span = {
+      spanId,
+      parentSpanId: parent,
+      runId: this._runId,
+      startedAt: this.now(),
+      ...init
+    };
+    await this.store.appendSpan(span);
+    this.stack.push(spanId);
+    return this.handle(span);
+  }
+  handle(span) {
+    return {
+      span,
+      end: async (patch) => {
+        const endedAt = this.now();
+        await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
+        this.pop(span.spanId);
+      },
+      fail: async (error, patch) => {
+        const endedAt = this.now();
+        const errStr = error instanceof Error ? error.message : error;
+        await this.store.updateSpan(span.spanId, {
+          endedAt,
+          status: "error",
+          error: errStr,
+          ...patch
+        });
+        this.pop(span.spanId);
+      }
+    };
+  }
+  pop(spanId) {
+    const idx = this.stack.lastIndexOf(spanId);
+    if (idx >= 0) this.stack.splice(idx, 1);
+  }
+  // ── Typed span conveniences ────────────────────────────────────────
+  llm(init) {
+    return this.span({ kind: "llm", ...init });
+  }
+  tool(init) {
+    return this.span({ kind: "tool", ...init });
+  }
+  retrieval(init) {
+    return this.span({ kind: "retrieval", ...init });
+  }
+  async recordJudge(verdict) {
+    const spanId = this.id();
+    const now = this.now();
+    const full = {
+      spanId,
+      runId: this._runId,
+      kind: "judge",
+      startedAt: now,
+      endedAt: now,
+      status: "ok",
+      ...verdict
+    };
+    await this.store.appendSpan(full);
+    return full;
+  }
+  sandbox(init) {
+    return this.span({ kind: "sandbox", ...init });
+  }
+  // ── Events ─────────────────────────────────────────────────────────
+  async emit(event) {
+    const full = {
+      eventId: this.id(),
+      runId: this._runId,
+      spanId: event.spanId ?? this.stack[this.stack.length - 1],
+      kind: event.kind,
+      timestamp: this.now(),
+      payload: event.payload ?? {}
+    };
+    await this.store.appendEvent(full);
+    return full;
+  }
+  // ── Budget ledger ──────────────────────────────────────────────────
+  async recordBudget(entry) {
+    const full = {
+      runId: this._runId,
+      timestamp: entry.timestamp ?? this.now(),
+      dimension: entry.dimension,
+      limit: entry.limit,
+      consumed: entry.consumed,
+      remaining: entry.remaining,
+      breached: entry.breached,
+      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
+    };
+    await this.store.appendBudgetEntry(full);
+    if (full.breached) {
+      await this.emit({
+        kind: "budget_breach",
+        spanId: full.spanId,
+        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
+      });
+    }
+    return full;
+  }
+  // ── Artifacts ──────────────────────────────────────────────────────
+  async recordArtifact(artifact) {
+    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
+    await this.store.appendArtifact(full);
+    return full;
+  }
+  // ── Nested composition ─────────────────────────────────────────────
+  /**
+   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
+   * Returns the fn's return value. Use this for the 95% case.
+   */
+  async within(init, fn) {
+    const handle = await this.span(init);
+    try {
+      const result = await fn(handle);
+      await handle.end();
+      return result;
+    } catch (err) {
+      await handle.fail(err instanceof Error ? err : String(err));
+      throw err;
+    }
+  }
+};
+function cryptoRandomId() {
+  if (typeof globalThis.crypto?.randomUUID === "function") return globalThis.crypto.randomUUID();
+  return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
+}
+function llmSpanFromProvider(args) {
+  return {
+    name: args.name ?? args.model,
+    model: args.model,
+    messages: args.messages,
+    output: args.output,
+    inputTokens: args.usage?.inputTokens,
+    outputTokens: args.usage?.outputTokens,
+    cachedTokens: args.usage?.cachedTokens,
+    reasoningTokens: args.usage?.reasoningTokens,
+    costUsd: args.costUsd,
+    finishReason: args.finishReason
+  };
+}
+// src/control-runtime.ts
+var DEFAULT_BUDGET = { // Fallback limits; runAgentControlLoop spreads config.budget over these.
+  maxSteps: 8, // upper bound on control-loop iterations
+  maxWallMs: 5 * 60 * 1e3 // five minutes in milliseconds; used as the wall-clock setTimeout delay
+};
+async function runAgentControlLoop(config) { // Runs an observe → validate → decide → act loop until a stop decision, budget limit, abort or runtime error ends it. Failures of the config callbacks are recorded in runtimeErrors and reported in the returned result rather than rethrown.
+  const budget = { ...DEFAULT_BUDGET, ...config.budget }; // caller-supplied budget fields override the defaults
+  const actionFailure = config.actionFailure ?? "continue"; // "stop" ends the run on a failed act(); any other value keeps looping
+  const controller = new AbortController(); // internal signal handed to the callbacks; fed by config.signal and the wall timer
+  const upstreamAbort = () => controller.abort(config.signal?.reason);
+  if (config.signal) {
+    if (config.signal.aborted) controller.abort(config.signal.reason);
+    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
+  }
+  const started = Date.now();
+  const wallTimer = budget.maxWallMs ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs) : void 0; // no timer when maxWallMs is falsy (0/undefined)
+  const history = [];
+  const emitter = config.store ? new TraceEmitter(config.store) : void 0; // tracing is enabled only when a store is supplied
+  let spentCostUsd = 0;
+  const runtimeErrors = [];
+  let lastStateFingerprint;
+  let lastActionFingerprint;
+  let noProgressStreak = 0;
+  let repeatedActionStreak = 0;
+  try {
+    if (emitter) {
+      await runTrace(runtimeErrors, 0, () => emitter.startRun({
+        scenarioId: config.scenarioId ?? "agent-control-loop",
+        projectId: config.projectId,
+        variantId: config.variantId,
+        layer: "meta",
+        tags: {
+          intent: config.intent.slice(0, 120), // tag holds at most the first 120 characters of the intent
+          maxSteps: String(budget.maxSteps),
+          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
+        }
+      }));
+    }
+    let state;
+    let evals;
+    try { // initial observation, before any action is taken
+      state = await config.observe({ history, abortSignal: controller.signal });
+    } catch (err) {
+      runtimeErrors.push(runtimeError("observe", 0, err));
+      return finish(emitter, {
+        intent: config.intent,
+        pass: false,
+        completed: false,
+        reason: runtimeErrors[0].message,
+        steps: history,
+        finalState: void 0,
+        finalEvals: [],
+        wallMs: Date.now() - started,
+        spentCostUsd,
+        runId: emitter?.runId ?? null,
+        failureClass: "unknown",
+        runtimeErrors,
+        stoppedBy: "runtime-error"
+      });
+    }
+    try { // initial validation of the observed state
+      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
+      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
+    } catch (err) {
+      runtimeErrors.push(runtimeError("validate", 0, err));
+      return finish(emitter, {
+        intent: config.intent,
+        pass: false,
+        completed: false,
+        reason: runtimeErrors[0].message, // NOTE(review): index 0 may be an earlier trace error from startRun rather than this validate error — confirm intent
+        steps: history,
+        finalState: state,
+        finalEvals: [],
+        wallMs: Date.now() - started,
+        spentCostUsd,
+        runId: emitter?.runId ?? null,
+        failureClass: "unknown",
+        runtimeErrors,
+        stoppedBy: "runtime-error"
+      });
+    }
+    lastStateFingerprint = fingerprintState(state, config.stopPolicies);
+    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
+      if (controller.signal.aborted) { // abort is checked once at the top of every iteration
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: abortReason(controller.signal),
+          score: void 0,
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "timeout", // NOTE(review): also reported when the abort came from config.signal, not only from the wall timer
+          runtimeErrors,
+          stoppedBy: "abort"
+        });
+      }
+      const budgetStop = budgetStopDecision(budget, spentCostUsd); // pre-step cost check
+      if (budgetStop.stop) {
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: budgetStop.reason,
+          score: averageScore(evals),
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "budget_exceeded",
+          runtimeErrors,
+          stoppedBy: "budget"
+        });
+      }
+      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
+      let stop;
+      try { // pre-step stop check: custom shouldStop if given, else defaultStopDecision
+        stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
+      } catch (err) {
+        runtimeErrors.push(runtimeError("stop-policy", stepIndex, err));
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: runtimeErrors[runtimeErrors.length - 1].message,
+          score: averageScore(evals),
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "unknown",
+          runtimeErrors,
+          stoppedBy: "runtime-error"
+        });
+      }
+      if (stop.stop) {
+        return finish(emitter, {
+          intent: config.intent,
+          pass: stop.pass,
+          completed: true,
+          reason: stop.reason,
+          score: stop.score,
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: stop.failureClass,
+          runtimeErrors,
+          stoppedBy: "stop-policy"
+        });
+      }
+      let decision;
+      try { // ask the policy what to do next
+        decision = await config.decide(ctx);
+      } catch (err) {
+        runtimeErrors.push(runtimeError("decide", stepIndex, err));
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: runtimeErrors[runtimeErrors.length - 1].message,
+          score: averageScore(evals),
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "unknown",
+          runtimeErrors,
+          stoppedBy: "runtime-error"
+        });
+      }
+      if (decision.type === "stop") { // the policy itself chose to stop
+        return finish(emitter, {
+          intent: config.intent,
+          pass: decision.pass ?? false,
+          completed: true,
+          reason: decision.reason,
+          score: decision.score,
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: decision.pass === false ? "unknown" : void 0,
+          runtimeErrors,
+          stoppedBy: "policy"
+        });
+      }
+      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
+      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1; // counts consecutive identical actions, starting at 1
+      lastActionFingerprint = actionFingerprint;
+      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
+      if (repeatedActionStop.stop) { // stops before the repeated action is executed
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: true,
+          reason: repeatedActionStop.reason,
+          score: averageScore(evals),
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "tool_recovery_failure",
+          runtimeErrors,
+          stoppedBy: "stop-policy"
+        });
+      }
+      const beforeState = state;
+      const evalsBefore = evals;
+      const scoreBefore = averageScore(evals);
+      const actionStarted = Date.now();
+      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({ // undefined when tracing is off or the span write failed
+        name: `control-step-${stepIndex}`,
+        toolName: "agent-control-action",
+        args: decision.action,
+        attributes: {
+          decision: decision.reason ?? "continue",
+          repeatedActionStreak
+        }
+      })) : void 0;
+      let actionOutcome;
+      try {
+        const result = await config.act(decision.action, ctx);
+        const costUsd = config.getActionCostUsd?.({
+          action: decision.action,
+          result,
+          state,
+          evals,
+          history
+        });
+        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) { // only finite, positive costs are added to the running total
+          spentCostUsd += costUsd;
+          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
+        }
+        actionOutcome = {
+          ok: true,
+          result,
+          ...costUsd !== void 0 ? { costUsd } : {},
+          durationMs: Date.now() - actionStarted
+        };
+      } catch (err) { // act() threw; a throw from getActionCostUsd also lands here
+        runtimeErrors.push(runtimeError("act", stepIndex, err));
+        actionOutcome = {
+          ok: false,
+          error: runtimeErrors[runtimeErrors.length - 1].message,
+          durationMs: Date.now() - actionStarted
+        };
+        if (actionFailure === "stop") {
+          await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed"));
+          const step2 = {
+            index: stepIndex,
+            decision,
+            beforeState,
+            afterState: state,
+            evalsBefore,
+            evalsAfter: evals,
+            actionOutcome,
+            startedAt: new Date(actionStarted).toISOString(),
+            endedAt: (/* @__PURE__ */ new Date()).toISOString()
+          };
+          history.push(step2);
+          await runOnStep(config.onStep, step2, runtimeErrors);
+          return finish(emitter, {
+            intent: config.intent,
+            pass: false,
+            completed: false,
+            reason: actionOutcome.error ?? "action failed",
+            score: averageScore(evals),
+            steps: history,
+            finalState: state,
+            finalEvals: evals,
+            wallMs: Date.now() - started,
+            spentCostUsd,
+            runId: emitter?.runId ?? null,
+            failureClass: "unknown",
+            runtimeErrors,
+            stoppedBy: "runtime-error"
+          });
+        }
+      }
+      try { // re-observe after the action (also after a failed action when not stopping)
+        state = await config.observe({ history, abortSignal: controller.signal });
+      } catch (err) {
+        runtimeErrors.push(runtimeError("observe", stepIndex, err));
+        const step2 = {
+          index: stepIndex,
+          decision,
+          beforeState,
+          afterState: beforeState, // no new observation is available, so the pre-action state is reported
+          evalsBefore,
+          evalsAfter: evals,
+          actionOutcome,
+          startedAt: new Date(actionStarted).toISOString(),
+          endedAt: (/* @__PURE__ */ new Date()).toISOString()
+        };
+        history.push(step2);
+        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1].message));
+        await runOnStep(config.onStep, step2, runtimeErrors);
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: runtimeErrors[runtimeErrors.length - 1].message,
+          score: averageScore(evals),
+          steps: history,
+          finalState: beforeState,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "unknown",
+          runtimeErrors,
+          stoppedBy: "runtime-error"
+        });
+      }
+      try { // re-validate the new state; judge spans target this step's span when one exists
+        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
+        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
+      } catch (err) {
+        runtimeErrors.push(runtimeError("validate", stepIndex, err));
+        const step2 = {
+          index: stepIndex,
+          decision,
+          beforeState,
+          afterState: state,
+          evalsBefore,
+          evalsAfter: evals, // still the pre-action evals, because validate did not produce new ones
+          actionOutcome,
+          startedAt: new Date(actionStarted).toISOString(),
+          endedAt: (/* @__PURE__ */ new Date()).toISOString()
+        };
+        history.push(step2);
+        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1].message));
+        await runOnStep(config.onStep, step2, runtimeErrors);
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: runtimeErrors[runtimeErrors.length - 1].message,
+          score: averageScore(evals),
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "unknown",
+          runtimeErrors,
+          stoppedBy: "runtime-error"
+        });
+      }
+      const scoreAfter = averageScore(evals);
+      const stateFingerprint = fingerprintState(state, config.stopPolicies);
+      const noProgressStop = noProgressStopDecision({
+        policies: config.stopPolicies,
+        lastStateFingerprint,
+        stateFingerprint,
+        scoreBefore,
+        scoreAfter,
+        currentStreak: noProgressStreak
+      });
+      noProgressStreak = noProgressStop.streak;
+      lastStateFingerprint = stateFingerprint;
+      const step = {
+        index: stepIndex,
+        decision,
+        beforeState,
+        afterState: state,
+        evalsBefore,
+        evalsAfter: evals,
+        actionOutcome,
+        startedAt: new Date(actionStarted).toISOString(),
+        endedAt: (/* @__PURE__ */ new Date()).toISOString()
+      };
+      history.push(step); // the step is recorded before its span is closed and before onStep runs
+      if (actionOutcome.ok) {
+        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
+          attributes: {
+            actionCostUsd: actionOutcome.costUsd ?? null,
+            spentCostUsd,
+            scoreBefore: scoreBefore ?? null,
+            scoreAfter: scoreAfter ?? null,
+            noProgressStreak
+          }
+        }));
+      } else {
+        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
+          attributes: {
+            spentCostUsd,
+            noProgressStreak
+          }
+        }));
+      }
+      await runOnStep(config.onStep, step, runtimeErrors);
+      if (noProgressStop.stop) {
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: true,
+          reason: noProgressStop.reason,
+          score: scoreAfter,
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "tool_recovery_failure",
+          runtimeErrors,
+          stoppedBy: "stop-policy"
+        });
+      }
+      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd); // post-step cost check, after this step's cost was added
+      if (postStepBudgetStop.stop) {
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: postStepBudgetStop.reason,
+          score: scoreAfter,
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "budget_exceeded",
+          runtimeErrors,
+          stoppedBy: "budget"
+        });
+      }
+      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
+      let postStepStop;
+      try { // post-step stop check, so a run can finish as passed even on its last allowed step
+        postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
+      } catch (err) {
+        runtimeErrors.push(runtimeError("stop-policy", stepIndex + 1, err));
+        return finish(emitter, {
+          intent: config.intent,
+          pass: false,
+          completed: false,
+          reason: runtimeErrors[runtimeErrors.length - 1].message,
+          score: averageScore(evals),
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: "unknown",
+          runtimeErrors,
+          stoppedBy: "runtime-error"
+        });
+      }
+      if (postStepStop.stop) {
+        return finish(emitter, {
+          intent: config.intent,
+          pass: postStepStop.pass,
+          completed: true,
+          reason: postStepStop.reason,
+          score: postStepStop.score,
+          steps: history,
+          finalState: state,
+          finalEvals: evals,
+          wallMs: Date.now() - started,
+          spentCostUsd,
+          runId: emitter?.runId ?? null,
+          failureClass: postStepStop.failureClass,
+          runtimeErrors,
+          stoppedBy: "stop-policy"
+        });
+      }
+    }
+    return finish(emitter, { // the loop ran maxSteps iterations without any stop condition firing
+      intent: config.intent,
+      pass: false,
+      completed: false,
+      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
+      steps: history,
+      finalState: state,
+      finalEvals: evals,
+      wallMs: Date.now() - started,
+      spentCostUsd,
+      runId: emitter?.runId ?? null,
+      failureClass: "budget_exceeded",
+      runtimeErrors,
+      stoppedBy: "budget"
+    });
+  } catch (err) { // safety net for anything not handled by the per-phase catches above
+    runtimeErrors.push(runtimeError("act", history.length, err)); // NOTE(review): the phase is labelled "act" whatever actually threw
+    return finish(emitter, {
+      intent: config.intent,
+      pass: false,
+      completed: false,
+      reason: runtimeErrors[runtimeErrors.length - 1].message,
+      steps: history,
+      finalState: void 0,
+      finalEvals: [],
+      wallMs: Date.now() - started,
+      spentCostUsd,
+      runId: emitter?.runId ?? null,
+      failureClass: "unknown",
+      runtimeErrors,
+      stoppedBy: "runtime-error"
+    });
+  } finally { // always release the wall timer and the upstream abort listener
+    if (wallTimer) clearTimeout(wallTimer);
+    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
+  }
+}
+function stopOnNoProgress(maxNoProgressSteps, options = {}) {
+  return { ...options, maxNoProgressSteps };
+}
+function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
+  return { ...options, maxRepeatedActions };
+}
+function objectiveEval(input) {
+  return { ...input, objective: true };
+}
+function subjectiveEval(input) {
+  return { ...input, objective: false };
+}
+function allCriticalPassed(evals) {
+  return evals.every((result) => result.passed || result.severity !== "critical" && result.severity !== "error");
+}
+function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
+  return {
+    intent,
+    state,
+    evals,
+    history,
+    budget,
+    stepIndex,
+    wallMs: Date.now() - started,
+    spentCostUsd,
+    remainingCostUsd: budget.maxCostUsd === void 0 ? void 0 : Math.max(0, budget.maxCostUsd - spentCostUsd),
+    abortSignal,
+    emitter
+  };
+}
+function defaultStopDecision(evals) {
+  if (!evals.length) return { stop: false, pass: false, reason: "no evals yet" };
+  const pass = allCriticalPassed(evals);
+  return pass ? { stop: true, pass: true, reason: "all critical evals passed", score: averageScore(evals) } : { stop: false, pass: false, reason: "critical evals still failing", score: averageScore(evals) };
+}
+function averageScore(evals) {
+  const scored = evals.map((result) => result.score).filter((score) => typeof score === "number");
+  if (!scored.length) return void 0;
+  return Math.round(scored.reduce((sum2, score) => sum2 + score, 0) / scored.length * 1e3) / 1e3;
+}
+function budgetStopDecision(budget, spentCostUsd) {
+  if (budget.maxCostUsd !== void 0 && spentCostUsd >= budget.maxCostUsd) {
+    return {
+      stop: true,
+      reason: `budget exhausted: maxCostUsd=${budget.maxCostUsd}`
+    };
+  }
+  return { stop: false, reason: "" };
+}
+async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
+  if (!emitter || budget.maxCostUsd === void 0) return;
+  const maxCostUsd = budget.maxCostUsd;
+  await runTrace(runtimeErrors, stepIndex, () => emitter.recordBudget({
+    dimension: "usd",
+    limit: maxCostUsd,
+    consumed: spentCostUsd,
+    remaining: Math.max(0, maxCostUsd - spentCostUsd),
+    breached: spentCostUsd >= maxCostUsd,
+    spanId: handle?.span.spanId
+  }));
+}
+async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
+  if (!emitter) return;
+  for (const result of evals) {
+    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge({
+      judgeId: result.objective ? "objective-validator" : "subjective-judge",
+      targetSpanId: targetSpanId ?? emitter.runId,
+      name: `control-eval/${result.id}`,
+      dimension: result.id,
+      score: typeof result.score === "number" ? result.score : result.passed ? 1 : 0,
+      rationale: result.detail,
+      evidence: result.evidence,
+      attributes: {
+        phase,
+        passed: result.passed,
+        severity: result.severity,
+        objective: result.objective
+      }
+    }));
+  }
+}
+async function runOnStep(onStep, step, runtimeErrors) {
+  if (!onStep) return;
+  try {
+    await onStep(step);
+  } catch (err) {
+    runtimeErrors.push(runtimeError("on-step", step.index, err));
+  }
+}
+async function runTrace(runtimeErrors, stepIndex, write) {
+  try {
+    return await write();
+  } catch (err) {
+    runtimeErrors.push(runtimeError("trace", stepIndex, err));
+    return void 0;
+  }
+}
+function noProgressStopDecision(args) {
+  const max = args.policies?.maxNoProgressSteps;
+  if (!max || max <= 0) return { stop: false, reason: "", streak: 0 };
+  const minScoreDelta = args.policies?.minScoreDelta ?? 1e-3;
+  const scoreDelta = Math.abs((args.scoreAfter ?? 0) - (args.scoreBefore ?? 0));
+  const stateUnchanged = args.lastStateFingerprint !== void 0 && args.lastStateFingerprint === args.stateFingerprint;
+  const scoreFlat = scoreDelta < minScoreDelta;
+  const streak = stateUnchanged && scoreFlat ? args.currentStreak + 1 : 0;
+  return streak >= max ? { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak } : { stop: false, reason: "", streak };
+}
+function repeatedActionStopDecision(policies, streak) {
+  const max = policies?.maxRepeatedActions;
+  if (!max || max <= 0 || streak < max) return { stop: false, reason: "" };
+  return {
+    stop: true,
+    reason: `stuck: repeated same action for ${streak} step(s)`
+  };
+}
+function fingerprintState(state, policies) {
+  if (policies?.stateFingerprint) return policies.stateFingerprint(state);
+  return stableFingerprint(state);
+}
+function fingerprintAction(action, policies) {
+  if (policies?.actionFingerprint) return policies.actionFingerprint(action);
+  return stableFingerprint(action);
+}
+function stableFingerprint(value) {
+  if (typeof value === "string") return value;
+  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
+  try {
+    return JSON.stringify(sortForFingerprint(value));
+  } catch {
+    return String(value);
+  }
+}
+function sortForFingerprint(value) {
+  if (Array.isArray(value)) return value.map(sortForFingerprint);
+  if (!value || typeof value !== "object") return value;
+  const record = value;
+  const sorted = {};
+  for (const key of Object.keys(record).sort()) {
+    sorted[key] = sortForFingerprint(record[key]);
+  }
+  return sorted;
+}
+function abortReason(signal) {
+  const reason = signal.reason;
+  if (reason instanceof Error) return reason.message;
+  return reason ? String(reason) : "aborted";
+}
+function runtimeError(phase, stepIndex, err) {
+  const message = err instanceof Error ? err.message : String(err);
+  return { phase, stepIndex, message };
+}
+async function finish(emitter, result) {
+  await runTrace(result.runtimeErrors, result.steps.length, () => emitter?.endRun({
+    pass: result.pass,
+    score: result.score ?? averageScore(result.finalEvals),
+    failureClass: result.failureClass,
+    notes: result.reason
+  }));
+  return result;
+}
+// src/feedback-trajectory.ts
+import { appendFile, mkdir, readFile } from "fs/promises";
+import { join } from "path";
+var DEFAULT_SPLIT_POLICY = { // Default percentage per split; the four values add up to 100. Presumably dataset-split shares — the consumer is outside this view, confirm.
+  trainPct: 70,
+  devPct: 15,
+  testPct: 10,
+  holdoutPct: 5
+};
+/**
+ * Map-backed feedback-trajectory store. Every value is deep-copied with
+ * `cloneTrajectory` on the way in and on the way out, so callers never share
+ * object references with the store.
+ */
+var InMemoryFeedbackTrajectoryStore = class {
+  trajectories = /* @__PURE__ */ new Map();
+  /** Inserts or replaces the trajectory stored under `trajectory.id`. */
+  async save(trajectory) {
+    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
+  }
+  /** Returns a copy of the trajectory with the given id, or `null` when absent. */
+  async get(id) {
+    const trajectory = this.trajectories.get(id);
+    return trajectory ? cloneTrajectory(trajectory) : null;
+  }
+  /** Returns copies of every stored trajectory accepted by `matchesFilter`. */
+  async list(filter = {}) {
+    return [...this.trajectories.values()].filter((trajectory) => matchesFilter(trajectory, filter)).map(cloneTrajectory);
+  }
+  /**
+   * Appends `attempt` to the trajectory and sets `updatedAt` to the attempt's
+   * `createdAt`. Returns a copy of the updated trajectory.
+   * @throws {Error} when no trajectory is stored under `id`.
+   */
+  async appendAttempt(id, attempt) {
+    const trajectory = this.trajectories.get(id);
+    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
+    const next = cloneTrajectory({
+      ...trajectory,
+      attempts: [...trajectory.attempts, attempt],
+      updatedAt: attempt.createdAt
+    });
+    this.trajectories.set(id, next);
+    return cloneTrajectory(next);
+  }
+  /**
+   * Records `label` on the trajectory. With an `attemptId` the label is added to
+   * that attempt's `feedback`; without one it is added to the trajectory-level
+   * `labels`. `updatedAt` becomes the label's `createdAt`.
+   * @throws {Error} when the trajectory is unknown, or when `attemptId` is given
+   *   but matches no attempt (previously the label was silently discarded).
+   */
+  async appendLabel(id, label, attemptId) {
+    const trajectory = this.trajectories.get(id);
+    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
+    if (attemptId && !trajectory.attempts.some((attempt) => attempt.id === attemptId)) {
+      throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown attempt "${attemptId}" on trajectory "${id}"`);
+    }
+    const attempts = attemptId ? trajectory.attempts.map((attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt) : trajectory.attempts;
+    const next = cloneTrajectory({
+      ...trajectory,
+      attempts,
+      labels: attemptId ? trajectory.labels : [...trajectory.labels, label],
+      updatedAt: label.createdAt
+    });
+    this.trajectories.set(id, next);
+    return cloneTrajectory(next);
+  }
+};
+/**
+ * Feedback-trajectory store persisted as an append-only NDJSON operation log
+ * (`feedback-trajectories.ndjson` inside `options.dir`). Reads are served from
+ * an in-memory store that is rebuilt by replaying the log on first use; every
+ * mutation is applied in memory first and then appended to the log.
+ */
+var FileSystemFeedbackTrajectoryStore = class {
+  dir;
+  memory = new InMemoryFeedbackTrajectoryStore();
+  loaded = false;
+  // In-flight replay shared by concurrent callers so the log is replayed once.
+  loading = null;
+  constructor(options) {
+    this.dir = options.dir;
+  }
+  /** Saves (inserts or replaces) a trajectory and logs a "save" record. */
+  async save(trajectory) {
+    await this.load();
+    await this.memory.save(trajectory);
+    await this.append({ op: "save", trajectory });
+  }
+  /** Returns the trajectory with the given id, or `null` when absent. */
+  async get(id) {
+    await this.load();
+    return this.memory.get(id);
+  }
+  /** Lists trajectories matching `filter`. */
+  async list(filter = {}) {
+    await this.load();
+    return this.memory.list(filter);
+  }
+  /** Appends an attempt; the log record is written only if the in-memory update succeeds. */
+  async appendAttempt(id, attempt) {
+    await this.load();
+    const next = await this.memory.appendAttempt(id, attempt);
+    await this.append({ op: "appendAttempt", id, attempt });
+    return next;
+  }
+  /** Appends a label; the log record is written only if the in-memory update succeeds. */
+  async appendLabel(id, label, attemptId) {
+    await this.load();
+    const next = await this.memory.appendLabel(id, label, attemptId);
+    await this.append({ op: "appendLabel", id, label, attemptId });
+    return next;
+  }
+  /** Writes one operation record as a single NDJSON line, creating the directory if needed. */
+  async append(record) {
+    await mkdir(this.dir, { recursive: true });
+    await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
+  }
+  /**
+   * Ensures the log has been replayed into memory exactly once. Concurrent
+   * callers await the same in-flight replay; without this, two callers racing
+   * past the `loaded` check would each replay the log and duplicate attempts
+   * and labels in memory.
+   */
+  async load() {
+    if (this.loaded) return;
+    if (!this.loading) {
+      this.loading = this.replayLog().finally(() => {
+        this.loading = null;
+      });
+    }
+    await this.loading;
+  }
+  /** Replays the NDJSON log into the in-memory store, then marks the store loaded. */
+  async replayLog() {
+    const file = join(this.dir, "feedback-trajectories.ndjson");
+    try {
+      const raw = await readFile(file, "utf8");
+      for (const line of raw.split("\n")) {
+        if (!line.trim()) continue;
+        try {
+          const record = JSON.parse(line);
+          if (record.op === "save") await this.memory.save(record.trajectory);
+          if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
+          if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
+        } catch {
+          // Best-effort replay: a malformed or inapplicable record is skipped
+          // so one bad line does not make the whole store unreadable.
+        }
+      }
+    } catch {
+      // Best-effort: an unreadable or missing log is treated as an empty store.
+    }
+    this.loaded = true;
+  }
+};
+/**
+ * Builds a feedback-trajectory record from `input`.
+ * `createdAt` defaults to the current time as an ISO string; `attempts` and
+ * `labels` default to empty arrays. When no `id` is supplied, one is derived as
+ * `ft_<hex>` from a hash of project id, scenario id, task intent and `createdAt`.
+ */
+function createFeedbackTrajectory(input) {
+  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
+  let id = input.id;
+  if (id === null || id === void 0) {
+    const seed = `${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`;
+    id = `ft_${stableHash(seed).toString(16)}`;
+  }
+  const attempts = input.attempts ?? [];
+  const labels = input.labels ?? [];
+  return {
+    id,
+    projectId: input.projectId,
+    scenarioId: input.scenarioId,
+    task: input.task,
+    attempts,
+    labels,
+    outcome: input.outcome,
+    split: input.split,
+    tags: input.tags,
+    createdAt,
+    metadata: input.metadata
+  };
+}
+/**
+ * Deterministically assigns a trajectory to "train", "dev", "test" or "holdout".
+ * The bucket is a hash of project id, scenario id, trajectory id and task intent
+ * taken modulo the sum of the weights, so the same trajectory always lands in
+ * the same split for a given policy.
+ *
+ * @param trajectory - Trajectory to place; `id` and `task.intent` are read.
+ * @param policy - Optional overrides for the default weights. Keys left
+ *   `undefined` keep their default instead of poisoning the total with NaN.
+ * @returns The split name.
+ * @throws {Error} when a weight is not a finite, non-negative number, or when
+ *   the weights do not sum above zero.
+ */
+function assignFeedbackSplit(trajectory, policy = {}) {
+  const overrides = policy ?? {};
+  const split = { ...DEFAULT_SPLIT_POLICY };
+  for (const key of Object.keys(split)) {
+    const override = overrides[key];
+    if (override === void 0) continue;
+    // NaN or negative weights used to fall through silently to "holdout".
+    if (typeof override !== "number" || !Number.isFinite(override) || override < 0) {
+      throw new Error(`assignFeedbackSplit: ${key} must be a finite, non-negative number`);
+    }
+    split[key] = override;
+  }
+  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
+  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
+  const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
+  if (bucket < split.trainPct) return "train";
+  if (bucket < split.trainPct + split.devPct) return "dev";
+  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
+  return "holdout";
+}
+/**
+ * Returns a shallow copy of `trajectory` whose `split` is set: an existing
+ * split is kept, otherwise one is computed with `assignFeedbackSplit`.
+ */
+function withAssignedFeedbackSplit(trajectory, policy) {
+  const split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
+  return { ...trajectory, split };
+}
+/**
+ * Wraps a trajectory as a dataset scenario. The scenario id is the trajectory's
+ * `scenarioId` (falling back to its `id`), the payload is the trajectory with a
+ * split assigned, and the tags merge the project id (when truthy), the
+ * trajectory's own tags, and a fixed `source: "feedback-trajectory"` marker.
+ */
+function feedbackTrajectoryToDatasetScenario(trajectory) {
+  const payload = withAssignedFeedbackSplit(trajectory);
+  const projectTag = payload.projectId ? { projectId: payload.projectId } : {};
+  const tags = {
+    ...projectTag,
+    ...payload.tags ?? {},
+    source: "feedback-trajectory"
+  };
+  return {
+    id: payload.scenarioId ?? payload.id,
+    split: payload.split,
+    payload,
+    tags
+  };
+}
+/** Converts each trajectory with `feedbackTrajectoryToDatasetScenario`, preserving order. */
+function feedbackTrajectoriesToDatasetScenarios(trajectories) {
+  return trajectories.map((trajectory) => feedbackTrajectoryToDatasetScenario(trajectory));
+}
+/**
+ * Flattens a trajectory into an optimizer row: scenario/trajectory ids, the
+ * distinct label kinds (in first-seen order), a score (the outcome score when
+ * present, otherwise derived from the labels), and a metadata bag.
+ */
+function feedbackTrajectoryToOptimizerRow(trajectory) {
+  const labels = allLabels(trajectory);
+  const distinctKinds = /* @__PURE__ */ new Set();
+  for (const label of labels) {
+    distinctKinds.add(label.kind);
+  }
+  const metadata = {
+    projectId: trajectory.projectId,
+    split: trajectory.split,
+    intent: trajectory.task.intent,
+    attempts: trajectory.attempts.length,
+    outcome: trajectory.outcome,
+    labels
+  };
+  return {
+    scenarioId: trajectory.scenarioId ?? trajectory.id,
+    trajectoryId: trajectory.id,
+    labelKinds: Array.from(distinctKinds),
+    score: trajectory.outcome?.score ?? scoreFromLabels(labels),
+    metadata
+  };
+}
+/** Converts each trajectory with `feedbackTrajectoryToOptimizerRow`, preserving order. */
+function feedbackTrajectoriesToOptimizerRows(trajectories) {
+  return trajectories.map((trajectory) => feedbackTrajectoryToOptimizerRow(trajectory));
+}
+/**
+ * Distills labels into a ranked list of preference-memory entries.
+ * Each label that yields an instruction becomes a candidate; candidates are
+ * de-duplicated on the lower-cased, whitespace-collapsed instruction (keeping
+ * the heaviest, first one on ties), sorted by descending weight, and cut to
+ * `options.maxEntries` (default 20).
+ */
+function summarizePreferenceMemory(trajectories, options = {}) {
+  const limit = options.maxEntries ?? 20;
+  const strongest = /* @__PURE__ */ new Map();
+  for (const trajectory of trajectories) {
+    for (const label of allLabels(trajectory)) {
+      const instruction = instructionFromLabel(trajectory, label);
+      if (!instruction) continue;
+      const candidate = {
+        instruction,
+        rationale: label.reason ?? `${label.kind} label from ${label.source}`,
+        weight: weightForLabel(label),
+        sourceTrajectoryId: trajectory.id,
+        sourceLabelId: label.id,
+        category: label.kind
+      };
+      const dedupeKey = candidate.instruction.toLowerCase().replace(/\s+/g, " ").trim();
+      const incumbent = strongest.get(dedupeKey);
+      // Only a strictly heavier candidate displaces the one already kept.
+      if (incumbent && !(candidate.weight > incumbent.weight)) continue;
+      strongest.set(dedupeKey, candidate);
+    }
+  }
+  const ranked = Array.from(strongest.values());
+  ranked.sort((a, b) => b.weight - a.weight);
+  return ranked.slice(0, limit);
+}
+/**
+ * Renders preference-memory entries as a Markdown bullet list under a
+ * "# Preference Memory" heading. Each entry contributes its instruction,
+ * rationale and source trajectory id; the output ends with one newline.
+ */
+function renderPreferenceMemoryMarkdown(entries) {
+  const header = ["# Preference Memory", ""];
+  const body = entries.flatMap((entry) => [
+    `- ${entry.instruction}`,
+    `  Rationale: ${entry.rationale}`,
+    `  Source: ${entry.sourceTrajectoryId}`,
+    ""
+  ]);
+  const document = header.concat(body).join("\n");
+  return document.trim() + "\n";
+}
+/**
+ * Serializes trajectories as JSONL: sorted by `id`, one canonicalized
+ * (key-sorted) JSON object per line, with a trailing newline.
+ */
+function serializeFeedbackTrajectoriesJsonl(trajectories) {
+  const ordered = trajectories.slice();
+  ordered.sort((a, b) => a.id.localeCompare(b.id));
+  const lines = ordered.map((trajectory) => JSON.stringify(canonicalize(trajectory)));
+  return lines.join("\n") + "\n";
+}
+/**
+ * Parses JSONL text into an array of trajectories, skipping blank lines.
+ *
+ * @param jsonl - Newline-separated JSON documents.
+ * @returns The parsed values in file order.
+ * @throws {SyntaxError} when a line is not valid JSON; the message names the
+ *   1-based line number and the original error is attached as `cause`.
+ */
+function parseFeedbackTrajectoriesJsonl(jsonl) {
+  const trajectories = [];
+  for (const [index, line] of jsonl.split("\n").entries()) {
+    if (!line.trim()) continue;
+    try {
+      trajectories.push(JSON.parse(line));
+    } catch (err) {
+      const detail = err instanceof Error ? err.message : String(err);
+      throw new SyntaxError(`parseFeedbackTrajectoriesJsonl: invalid JSON on line ${index + 1}: ${detail}`, { cause: err });
+    }
+  }
+  return trajectories;
+}
+/**
+ * Converts a finished control-loop run into a feedback trajectory.
+ * Every step becomes an attempt, a single system label records approve/reject
+ * from `run.pass`, and the outcome carries score, cost and stop details.
+ * The trajectory id is `run.runId` when set, otherwise `ft_control_<hex>`
+ * derived from the intent and `createdAt`.
+ */
+function controlRunToFeedbackTrajectory(run, options = {}) {
+  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
+  let trajectoryId = run.runId;
+  if (trajectoryId === null || trajectoryId === void 0) {
+    trajectoryId = `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;
+  }
+  const toAttempt = (step) => ({
+    id: `${trajectoryId}_step_${step.index}`,
+    stepIndex: step.index,
+    artifactType: options.artifactType ?? "action",
+    // Caller-supplied artifact first, then the action result, then the decision.
+    artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
+    proposedAction: options.proposedActionFromStep?.(step),
+    evals: step.evalsAfter,
+    createdAt: step.startedAt,
+    metadata: {
+      decision: step.decision,
+      actionOutcome: step.actionOutcome
+    }
+  });
+  const attempts = run.steps.map((step) => toAttempt(step));
+  const verdictLabel = {
+    source: "system",
+    kind: run.pass ? "approve" : "reject",
+    value: run.pass,
+    reason: run.reason,
+    severity: run.pass ? "info" : "error",
+    createdAt
+  };
+  const outcome = {
+    success: run.pass,
+    score: run.score,
+    costUsd: run.spentCostUsd,
+    detail: run.reason,
+    observedAt: createdAt,
+    metadata: {
+      stoppedBy: run.stoppedBy,
+      failureClass: run.failureClass
+    }
+  };
+  return createFeedbackTrajectory({
+    id: trajectoryId,
+    projectId: options.projectId,
+    scenarioId: options.scenarioId,
+    task: { intent: run.intent },
+    createdAt,
+    attempts,
+    labels: [verdictLabel],
+    outcome
+  });
+}
+/**
+ * Collects trajectory-level labels followed by per-attempt feedback labels and
+ * removes duplicates, keeping the first occurrence.
+ *
+ * Labels with an `id` are de-duplicated on that id. Labels without one use a
+ * composite key of source, kind, createdAt, value and reason. `reason` is part
+ * of the key because comment-style labels carry their content there: without
+ * it, distinct id-less comments sharing a source and timestamp collapsed.
+ *
+ * @param trajectory - Object with `labels` and `attempts` arrays.
+ * @returns The de-duplicated labels in first-seen order.
+ */
+function allLabels(trajectory) {
+  const labels = [
+    ...trajectory.labels,
+    ...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
+  ];
+  const seen = /* @__PURE__ */ new Set();
+  return labels.filter((label) => {
+    const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}|${label.reason ?? ""}`;
+    if (seen.has(key)) return false;
+    seen.add(key);
+    return true;
+  });
+}
+/**
+ * Derives a score in [0, 1] from labels, rounded to three decimals.
+ * "approve"/"select" count as 1, "reject"/"policy_block" as 0, and numeric
+ * "rate" labels are clamped into [0, 1]; every other label is ignored.
+ *
+ * @param labels - Labels to score.
+ * @returns The mean of the scorable labels, or `undefined` when none are scorable.
+ */
+function scoreFromLabels(labels) {
+  if (!labels.length) return void 0;
+  const scored = labels.map((label) => {
+    if (label.kind === "approve" || label.kind === "select") return 1;
+    if (label.kind === "reject" || label.kind === "policy_block") return 0;
+    // A NaN rating is unscorable; letting it through would turn the mean into NaN.
+    if (label.kind === "rate" && typeof label.value === "number" && !Number.isNaN(label.value)) return Math.max(0, Math.min(1, label.value));
+    return void 0;
+  }).filter((value) => typeof value === "number");
+  if (!scored.length) return void 0;
+  return Math.round(scored.reduce((sum2, value) => sum2 + value, 0) / scored.length * 1e3) / 1e3;
+}
+/**
+ * Turns a label into a reusable instruction sentence, or `undefined` when the
+ * label has no `reason` or its kind carries no instruction. Reject, select and
+ * approve labels quote the task intent shortened to 80 characters.
+ */
+function instructionFromLabel(trajectory, label) {
+  const { kind, reason } = label;
+  if (!reason) return void 0;
+  switch (kind) {
+    case "reject":
+      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${reason}`;
+    case "revision_request":
+      return `Revise similar work by applying: ${reason}`;
+    case "select":
+      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${reason}`;
+    case "approve":
+      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${reason}`;
+    case "comment":
+      return reason;
+    default:
+      return void 0;
+  }
+}
+/**
+ * Computes a label's ranking weight as severity factor × source factor.
+ * Severity: critical 4, error 3, warning 2, anything else 1.
+ * Source: user 3, metric or environment 2, anything else 1.
+ */
+function weightForLabel(label) {
+  let severityFactor;
+  switch (label.severity) {
+    case "critical":
+      severityFactor = 4;
+      break;
+    case "error":
+      severityFactor = 3;
+      break;
+    case "warning":
+      severityFactor = 2;
+      break;
+    default:
+      severityFactor = 1;
+  }
+  let sourceFactor;
+  switch (label.source) {
+    case "user":
+      sourceFactor = 3;
+      break;
+    case "metric":
+    case "environment":
+      sourceFactor = 2;
+      break;
+    default:
+      sourceFactor = 1;
+  }
+  return severityFactor * sourceFactor;
+}
+/**
+ * Tests a trajectory against an optional filter. Each of `projectId`,
+ * `scenarioId` and `split` is only checked when set (truthy) on the filter;
+ * `tag` is a `[key, value]` pair that must equal the trajectory's tag value.
+ */
+function matchesFilter(trajectory, filter) {
+  const checks = [
+    () => !filter.projectId || trajectory.projectId === filter.projectId,
+    () => !filter.scenarioId || trajectory.scenarioId === filter.scenarioId,
+    () => !filter.split || trajectory.split === filter.split,
+    () => {
+      if (!filter.tag) return true;
+      const [tagKey, tagValue] = filter.tag;
+      return trajectory.tags?.[tagKey] === tagValue;
+    }
+  ];
+  // `every` stops at the first failing check, like the early returns it replaces.
+  return checks.every((check) => check());
+}
+/**
+ * Deep-copies a trajectory through a JSON round trip. Only JSON-representable
+ * data survives: properties whose value is `undefined` are dropped.
+ */
+function cloneTrajectory(trajectory) {
+  const serialized = JSON.stringify(trajectory);
+  return JSON.parse(serialized);
+}
+/**
+ * Collapses whitespace runs to single spaces, trims the ends, and shortens the
+ * result to at most `max` characters followed by "..." when it is longer.
+ */
+function compact(value, max) {
+  const normalized = value.replace(/\s+/g, " ").trim();
+  if (normalized.length <= max) {
+    return normalized;
+  }
+  const head = normalized.slice(0, max).trim();
+  return `${head}...`;
+}
+/**
+ * Hashes a string to an unsigned 32-bit integer: start from 2166136261, then
+ * for each UTF-16 code unit XOR it in and multiply by 16777619 with 32-bit
+ * wrap-around. The result is deterministic for a given input.
+ */
+function stableHash(input) {
+  const PRIME = 16777619;
+  let state = 2166136261;
+  let position = 0;
+  while (position < input.length) {
+    state = Math.imul(state ^ input.charCodeAt(position), PRIME);
+    position += 1;
+  }
+  // Reinterpret the signed 32-bit state as unsigned.
+  return state >>> 0;
+}
+/**
+ * Returns a copy of `value` in which every object's keys are inserted in
+ * sorted order, recursively. Primitives and `null` are returned as-is and
+ * arrays keep their element order.
+ */
+function canonicalize(value) {
+  const isContainer = value !== null && typeof value === "object";
+  if (!isContainer) return value;
+  if (Array.isArray(value)) {
+    return value.map((item) => canonicalize(item));
+  }
+  const out = {};
+  Object.keys(value).sort().forEach((key) => {
+    out[key] = canonicalize(value[key]);
+  });
+  return out;
+}
 // src/prompt-registry.ts
 var PromptRegistry = class {
   entries = /* @__PURE__ */ new Map();
@@ -3053,231 +4252,53 @@ var FileSystemTraceStore = class {
         }
       }
     } catch {
-    }
-    this.index = store;
-    this.loaded = true;
-    return store;
-  }
-  async appendRun(run) {
-    await this.append("runs", run);
-  }
-  async updateRun(runId, patch) {
-    await this.append("runs", { runId, ...patch, _update: true });
-    if (this.index) await this.index.updateRun(runId, patch);
-  }
-  async appendSpan(span) {
-    await this.append("spans", span);
-  }
-  async updateSpan(spanId, patch) {
-    await this.append("spans", { spanId, ...patch, _update: true });
-    if (this.index) await this.index.updateSpan(spanId, patch);
-  }
-  async appendEvent(event) {
-    await this.append("events", event);
-  }
-  async appendArtifact(artifact) {
-    await this.append("artifacts", artifact);
-  }
-  async appendBudgetEntry(entry) {
-    await this.append("budget", entry);
-  }
-  async getRun(runId) {
-    return (await this.load()).getRun(runId);
-  }
-  async listRuns(filter) {
-    return (await this.load()).listRuns(filter);
-  }
-  async spans(filter) {
-    return (await this.load()).spans(filter);
-  }
-  async events(filter) {
-    return (await this.load()).events(filter);
-  }
-  async budget(runId) {
-    return (await this.load()).budget(runId);
-  }
-  async artifacts(runId) {
-    return (await this.load()).artifacts(runId);
-  }
-};
-// src/trace/emitter.ts
-var TraceEmitter = class {
-  store;
-  stack = [];
-  _runId;
-  now;
-  id;
-  constructor(store, options = {}) {
-    this.store = store;
-    this.now = options.now ?? (() => Date.now());
-    this.id = options.id ?? (() => cryptoRandomId());
-    this._runId = options.runId ?? this.id();
-  }
-  get runId() {
-    return this._runId;
-  }
-  // ── Run lifecycle ──────────────────────────────────────────────────
-  async startRun(run) {
-    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
-    await this.store.appendRun(full);
-    return full;
-  }
-  async endRun(outcome) {
-    const status = outcome?.pass === false ? "failed" : "completed";
-    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
+    }
+    this.index = store;
+    this.loaded = true;
+    return store;
   }
-  async abortRun(reason) {
-    await this.store.updateRun(this._runId, {
-      endedAt: this.now(),
-      status: "aborted",
-      outcome: { pass: false, notes: reason }
-    });
+  async appendRun(run) {
+    await this.append("runs", run);
   }
-  // ── Generic span ───────────────────────────────────────────────────
-  async span(init) {
-    const spanId = this.id();
-    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
-    const span = {
-      spanId,
-      parentSpanId: parent,
-      runId: this._runId,
-      startedAt: this.now(),
-      ...init
-    };
-    await this.store.appendSpan(span);
-    this.stack.push(spanId);
-    return this.handle(span);
+  async updateRun(runId, patch) {
+    await this.append("runs", { runId, ...patch, _update: true });
+    if (this.index) await this.index.updateRun(runId, patch);
   }
-  handle(span) {
-    return {
-      span,
-      end: async (patch) => {
-        const endedAt = this.now();
-        await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
-        this.pop(span.spanId);
-      },
-      fail: async (error, patch) => {
-        const endedAt = this.now();
-        const errStr = error instanceof Error ? error.message : error;
-        await this.store.updateSpan(span.spanId, {
-          endedAt,
-          status: "error",
-          error: errStr,
-          ...patch
-        });
-        this.pop(span.spanId);
-      }
-    };
+  async appendSpan(span) {
+    await this.append("spans", span);
   }
-  pop(spanId) {
-    const idx = this.stack.lastIndexOf(spanId);
-    if (idx >= 0) this.stack.splice(idx, 1);
+  async updateSpan(spanId, patch) {
+    await this.append("spans", { spanId, ...patch, _update: true });
+    if (this.index) await this.index.updateSpan(spanId, patch);
   }
-  // ── Typed span conveniences ────────────────────────────────────────
-  llm(init) {
-    return this.span({ kind: "llm", ...init });
+  async appendEvent(event) {
+    await this.append("events", event);
   }
-  tool(init) {
-    return this.span({ kind: "tool", ...init });
+  async appendArtifact(artifact) {
+    await this.append("artifacts", artifact);
   }
-  retrieval(init) {
-    return this.span({ kind: "retrieval", ...init });
+  async appendBudgetEntry(entry) {
+    await this.append("budget", entry);
   }
-  async recordJudge(verdict) {
-    const spanId = this.id();
-    const now = this.now();
-    const full = {
-      spanId,
-      runId: this._runId,
-      kind: "judge",
-      startedAt: now,
-      endedAt: now,
-      status: "ok",
-      ...verdict
-    };
-    await this.store.appendSpan(full);
-    return full;
+  async getRun(runId) {
+    return (await this.load()).getRun(runId);
   }
-  sandbox(init) {
-    return this.span({ kind: "sandbox", ...init });
+  async listRuns(filter) {
+    return (await this.load()).listRuns(filter);
   }
-  // ── Events ─────────────────────────────────────────────────────────
-  async emit(event) {
-    const full = {
-      eventId: this.id(),
-      runId: this._runId,
-      spanId: event.spanId ?? this.stack[this.stack.length - 1],
-      kind: event.kind,
-      timestamp: this.now(),
-      payload: event.payload ?? {}
-    };
-    await this.store.appendEvent(full);
-    return full;
+  async spans(filter) {
+    return (await this.load()).spans(filter);
   }
-  // ── Budget ledger ──────────────────────────────────────────────────
-  async recordBudget(entry) {
-    const full = {
-      runId: this._runId,
-      timestamp: entry.timestamp ?? this.now(),
-      dimension: entry.dimension,
-      limit: entry.limit,
-      consumed: entry.consumed,
-      remaining: entry.remaining,
-      breached: entry.breached,
-      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
-    };
-    await this.store.appendBudgetEntry(full);
-    if (full.breached) {
-      await this.emit({
-        kind: "budget_breach",
-        spanId: full.spanId,
-        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
-      });
-    }
-    return full;
+  async events(filter) {
+    return (await this.load()).events(filter);
   }
-  // ── Artifacts ──────────────────────────────────────────────────────
-  async recordArtifact(artifact) {
-    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
-    await this.store.appendArtifact(full);
-    return full;
+  async budget(runId) {
+    return (await this.load()).budget(runId);
   }
-  // ── Nested composition ─────────────────────────────────────────────
-  /**
-   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
-   * Returns the fn's return value. Use this for the 95% case.
-   */
-  async within(init, fn) {
-    const handle = await this.span(init);
-    try {
-      const result = await fn(handle);
-      await handle.end();
-      return result;
-    } catch (err) {
-      await handle.fail(err instanceof Error ? err : String(err));
-      throw err;
-    }
+  async artifacts(runId) {
+    return (await this.load()).artifacts(runId);
   }
 };
-function cryptoRandomId() {
-  if (typeof globalThis.crypto?.randomUUID === "function") return globalThis.crypto.randomUUID();
-  return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
-}
-function llmSpanFromProvider(args) {
-  return {
-    name: args.name ?? args.model,
-    model: args.model,
-    messages: args.messages,
-    output: args.output,
-    inputTokens: args.usage?.inputTokens,
-    outputTokens: args.usage?.outputTokens,
-    cachedTokens: args.usage?.cachedTokens,
-    reasoningTokens: args.usage?.reasoningTokens,
-    costUsd: args.costUsd,
-    finishReason: args.finishReason
-  };
-}
 // src/sandbox-harness.ts
 var vitestTestParser = {
@@ -3887,6 +4908,157 @@ function safeJson(x) {
   }
 }
+// src/propose-review-control.ts
+// Next-shot instruction used when no prior review supplies one and the caller
+// gives no `fallbackInstruction`.
+var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
+/**
+ * Runs a propose → verify → review cycle by delegating to `runAgentControlLoop`.
+ *
+ * Each control-loop action ("shot") calls `config.propose`, verifies the
+ * resulting state with `config.verify`, and, when verification fails, asks
+ * `config.review` for guidance on the next shot. Every shot is appended to the
+ * review memory.
+ *
+ * @param config - Loop configuration. Fields read here: `goal`, `initialState`,
+ *   `propose`, `verify`, `review`, plus the optional `maxShots` (default 10),
+ *   `confidenceFloor` (default 0.3), `confidenceFloorWindow` (default 2),
+ *   `memory` (default `inMemoryReviewStore()`), `fallbackInstruction`,
+ *   `failureClassFromVerification`, `maxWallMs`, `store`, `scenarioId`
+ *   (default "propose-review-control"), `projectId`, `variantId` and
+ *   `actionFailure` (default "stop").
+ * @returns Whatever `runAgentControlLoop` returns for this configuration.
+ */
+async function runProposeReviewAsControlLoop(config) {
+  const maxShots = config.maxShots ?? 10;
+  const confidenceFloor = config.confidenceFloor ?? 0.3;
+  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
+  const memory = config.memory ?? inMemoryReviewStore();
+  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
+  const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;
+  // Number of consecutive reviews whose confidence was at or below the floor.
+  let lowConfidenceStreak = 0;
+  // Snapshot returned by `observe`; replaced at the end of every `act` call.
+  let current = {
+    shot: 0,
+    state: config.initialState,
+    priorReview: null,
+    verification: { pass: false },
+    memory: await memory.load(),
+    completed: false,
+    reviewAvailable: false
+  };
+  return runAgentControlLoop({
+    intent: config.goal,
+    budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
+    store: config.store,
+    scenarioId: config.scenarioId ?? "propose-review-control",
+    projectId: config.projectId,
+    variantId: config.variantId,
+    actionFailure: config.actionFailure ?? "stop",
+    observe: () => current,
+    // A single critical eval mirroring the latest verification result.
+    validate: ({ state }) => [
+      objectiveEval({
+        id: "verification",
+        passed: state.verification.pass,
+        score: state.verification.score,
+        severity: "critical",
+        detail: state.verification.pass ? "verification passed" : `verification failed${state.verification.failingLayers?.length ? `: ${state.verification.failingLayers.join(", ")}` : ""}`
+      })
+    ],
+    // Stop on a passing verification, or when the last shot marked the run
+    // completed (the reviewer or the low-confidence check ended continuation).
+    shouldStop: ({ state }) => {
+      if (state.verification.pass) {
+        return { stop: true, pass: true, reason: "verification passed", score: state.verification.score };
+      }
+      if (state.completed) {
+        return {
+          stop: true,
+          pass: false,
+          reason: "reviewer stopped continuation",
+          score: state.verification.score,
+          failureClass: failureClassFromVerification(state.verification)
+        };
+      }
+      return { stop: false, pass: false, reason: "verification still failing", score: state.verification.score };
+    },
+    // Always proposes the next shot; the reason carries the reviewer's
+    // instruction when there is one, otherwise the fallback instruction.
+    decide: ({ state }) => ({
+      type: "continue",
+      action: { type: "propose-review-shot", shot: state.shot + 1 },
+      reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
+    }),
+    act: async (action, ctx) => {
+      const shot = action.shot;
+      const proposeOut = await config.propose({
+        shot,
+        goal: config.goal,
+        state: current.state,
+        priorReview: current.priorReview,
+        abortSignal: ctx.abortSignal,
+        emitter: ctx.emitter
+      });
+      const nextState = proposeOut.state;
+      const verification = await config.verify(nextState);
+      let review = null;
+      let reviewAvailable = false;
+      let reviewError;
+      let shouldContinue = !verification.pass;
+      // The reviewer is consulted only when verification failed.
+      if (!verification.pass) {
+        try {
+          review = await config.review({
+            shot,
+            goal: config.goal,
+            state: nextState,
+            verification,
+            traceSummary: proposeOut.traceSummary,
+            memory: await memory.load()
+          });
+          reviewAvailable = true;
+          shouldContinue = review.shouldContinue;
+          lowConfidenceStreak = review.confidence <= confidenceFloor ? lowConfidenceStreak + 1 : 0;
+          // Give up once the streak reaches the window; a window of 0 disables this check.
+          if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) shouldContinue = false;
+        } catch (err) {
+          // Reviewer failed: reuse the prior review if there is one, otherwise
+          // synthesize a fallback review, and keep iterating. The
+          // low-confidence streak is left untouched on this path.
+          reviewError = err instanceof Error ? err.message : String(err);
+          review = current.priorReview ?? {
+            observations: "Reviewer unavailable.",
+            diagnosis: reviewError,
+            nextShotInstruction: fallbackInstruction,
+            shouldContinue: true,
+            confidence: 0
+          };
+          shouldContinue = true;
+        }
+      } else {
+        review = {
+          observations: "Verification passed.",
+          diagnosis: "No further revision needed.",
+          nextShotInstruction: "",
+          shouldContinue: false,
+          confidence: 1
+        };
+      }
+      // Memory entry: the review fields plus shot number, timestamp and a
+      // summary of the verification result.
+      const entry = {
+        ...review ?? {
+          observations: "No review.",
+          diagnosis: "",
+          nextShotInstruction: fallbackInstruction,
+          shouldContinue,
+          confidence: 0
+        },
+        shot,
+        timestamp: Date.now(),
+        verification: {
+          pass: verification.pass,
+          score: verification.score,
+          failingLayers: verification.failingLayers
+        }
+      };
+      await memory.append(entry);
+      // Publish the new snapshot for the next observe/validate/shouldStop round.
+      current = {
+        shot,
+        state: nextState,
+        priorReview: review,
+        verification,
+        traceSummary: proposeOut.traceSummary,
+        memory: await memory.load(),
+        completed: verification.pass || !shouldContinue,
+        reviewAvailable,
+        reviewError
+      };
+      return {
+        state: nextState,
+        verification,
+        traceSummary: proposeOut.traceSummary,
+        review,
+        reviewAvailable,
+        reviewError
+      };
+    }
+  });
+}
+/**
+ * Default mapping from a verification result to a failure class:
+ * `undefined` when verification passed, "instruction_following" when any
+ * failing layers are listed, and "unknown" otherwise.
+ */
+function controlFailureClassFromVerification(verification) {
+  if (verification.pass) {
+    return void 0;
+  }
+  const hasFailingLayers = Boolean(verification.failingLayers?.length);
+  if (hasFailingLayers) {
+    return "instruction_following";
+  }
+  return "unknown";
+}
 // src/trace/schema.ts
 var TRACE_SCHEMA_VERSION = "1.0.0";
 var FAILURE_CLASSES = [
@@ -5210,7 +6382,7 @@ function assertNonNegative(n, name) {
 // src/muffled-gate-scanner.ts
 import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
-import { join } from "path";
+import { join as join2 } from "path";
 function codeOf(line) {
   return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
 }
@@ -5314,11 +6486,11 @@ var UNIVERSAL_FINDERS = [
 function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
   const matches2 = [];
   const walk = (rel) => {
-    const abs = join(repoRoot, rel);
+    const abs = join2(repoRoot, rel);
     if (!existsSync2(abs)) return;
     for (const entry of readdirSync(abs)) {
-      const sub = join(rel, entry);
-      const subAbs = join(repoRoot, sub);
+      const sub = join2(rel, entry);
+      const subAbs = join2(repoRoot, sub);
       let st;
       try {
         st = statSync(subAbs);
@@ -5347,7 +6519,7 @@ function scanForMuffledGates(opts) {
   const findings = [];
   const scanned = /* @__PURE__ */ new Set();
   for (const file of opts.scanFiles) {
-    const abs = join(opts.repoRoot, file);
+    const abs = join2(opts.repoRoot, file);
     if (!existsSync2(abs)) continue;
     const text = readFileSync2(abs, "utf8");
     for (const find of opts.finders) findings.push(...find(file, text));
@@ -5362,7 +6534,7 @@ function scanForMuffledGates(opts) {
     );
     for (const file of importers) {
       if (scanned.has(file)) continue;
-      const abs = join(opts.repoRoot, file);
+      const abs = join2(opts.repoRoot, file);
       if (!existsSync2(abs)) continue;
       const text = readFileSync2(abs, "utf8");
       for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
@@ -5557,7 +6729,7 @@ var Dataset = class _Dataset {
    * Write to disk for contamination-verifiable archives.
    */
   toJsonl() {
-    return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
+    return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
   }
   static fromJsonl(jsonl, manifest) {
     const scenarios = [];
@@ -5570,18 +6742,18 @@ var Dataset = class _Dataset {
   }
 };
 async function hashScenarios(scenarios) {
-  const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
+  const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
   const text = JSON.stringify(canonical);
   const bytes = new TextEncoder().encode(text);
   const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
   return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
 }
-function canonicalize(v) {
+function canonicalize2(v) {
   if (v === null || typeof v !== "object") return v;
-  if (Array.isArray(v)) return v.map(canonicalize);
+  if (Array.isArray(v)) return v.map(canonicalize2);
   const keys = Object.keys(v).sort();
   const out = {};
-  for (const k of keys) out[k] = canonicalize(v[k]);
+  for (const k of keys) out[k] = canonicalize2(v[k]);
   return out;
 }
 function seededShuffle(items, seed) {
@@ -7350,7 +8522,7 @@ async function commitBisect(options) {
 }
 async function promptBisect(options) {
   const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
-  const join3 = (paragraphs) => paragraphs.join("\n\n");
+  const join4 = (paragraphs) => paragraphs.join("\n\n");
   const goodParas = split(options.good);
   const badParas = split(options.bad);
   if (goodParas.length !== badParas.length) {
@@ -7368,7 +8540,7 @@ async function promptBisect(options) {
   const result = await bisect({
     good: goodMask,
     bad: badMask,
-    runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
+    runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
     maxIterations: options.maxIterations ?? n + 5,
     halfway: (g, b) => {
       for (let i = 0; i < g.length; i++) {
@@ -7399,12 +8571,12 @@ async function promptBisect(options) {
     }
   }
   const materializedPath = result.path.map((s) => ({
-    state: join3(paragraphsFor(s.state)),
+    state: join4(paragraphsFor(s.state)),
     score: s.score,
     pass: s.pass
   }));
   return {
-    culprit: join3(paragraphsFor(culprit)),
+    culprit: join4(paragraphsFor(culprit)),
     path: materializedPath,
     converged: result.converged,
     inputInconsistent: result.inputInconsistent,
@@ -7615,7 +8787,7 @@ function attributeStep(op, prmA, prmB) {
 // src/pre-registration.ts
 async function signManifest(m) {
-  const canonical = canonicalize2(m);
+  const canonical = canonicalize3(m);
   const bytes = new TextEncoder().encode(JSON.stringify(canonical));
   const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
   const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
@@ -7645,12 +8817,12 @@ async function evaluateHypothesis(manifest, observed) {
     rejectionReasons: reasons
   };
 }
-function canonicalize2(v) {
+function canonicalize3(v) {
   if (v === null || typeof v !== "object") return v;
-  if (Array.isArray(v)) return v.map(canonicalize2);
+  if (Array.isArray(v)) return v.map(canonicalize3);
   const keys = Object.keys(v).sort();
   const out = {};
-  for (const k of keys) out[k] = canonicalize2(v[k]);
+  for (const k of keys) out[k] = canonicalize3(v[k]);
   return out;
 }
@@ -8459,7 +9631,7 @@ function mergeSignals(a, b) {
 // src/command-runner.ts
 import { spawnSync } from "child_process";
 import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
-import { join as join2 } from "path";
+import { join as join3 } from "path";
 var localCommandRunner = {
   name: "local",
   async run(input) {
@@ -8506,7 +9678,7 @@ var localCommandRunner = {
     const out = [];
     for (const name of entries) {
       try {
-        const st = statSync2(join2(path, name));
+        const st = statSync2(join3(path, name));
         out.push({
           name,
           isDirectory: st.isDirectory(),
@@ -12298,6 +13470,46 @@ function truncate3(s, max) {
 function quote(s) { // Returns s with every backtick preceded by a backslash.
   return s.replace(/`/g, "\\`"); // global regex, so all occurrences are replaced, not just the first
 }
+function autoCloseTruncatedJson(raw) { // Scans raw once and appends the closers for any string, object, or array still open at the end; returns null on a mismatched closer.
+  const stack = []; // "{" / "[" openers seen outside strings, innermost last
+  let inString = false; // true while the scan is inside a double-quoted string
+  let escape = false; // true when the previous in-string character was a backslash
+  for (const c of raw) {
+    if (escape) { // the character right after a backslash is consumed without interpretation
+      escape = false;
+      continue;
+    }
+    if (inString) {
+      if (c === "\\") {
+        escape = true;
+        continue;
+      }
+      if (c === '"') { // unescaped quote ends the string
+        inString = false;
+        continue;
+      }
+      continue; // any other in-string character (including braces/brackets) is ignored
+    }
+    if (c === '"') {
+      inString = true;
+      continue;
+    }
+    if (c === "{" || c === "[") stack.push(c);
+    else if (c === "}") {
+      if (stack.pop() !== "{") return null; // closer without a matching "{" (or on an empty stack): give up
+    } else if (c === "]") {
+      if (stack.pop() !== "[") return null; // closer without a matching "[" (or on an empty stack): give up
+    }
+  }
+  if (stack.length === 0 && !inString) return raw; // nothing left open: input is returned unchanged
+  let suffix = "";
+  if (inString) suffix += '"'; // NOTE(review): if raw ends on a backslash inside a string, this quote is escaped and the string stays open
+  while (stack.length > 0) { // close remaining containers from innermost to outermost
+    const opener = stack.pop();
+    suffix += opener === "{" ? "}" : "]";
+  }
+  return raw + suffix; // only brackets/quotes are balanced here; the result is not checked for JSON validity
+}
 function parseReflectionResponse(raw, maxProposals) {
   let text = raw.trim();
   if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
@@ -12322,6 +13534,18 @@ function parseReflectionResponse(raw, maxProposals) {
     } catch {
     }
   }
+  if (parsed == null) {
+    for (const slice of candidates) {
+      const closed = autoCloseTruncatedJson(slice);
+      if (closed != null && closed !== slice) {
+        try {
+          parsed = JSON.parse(closed);
+          break;
+        } catch {
+        }
+      }
+    }
+  }
   if (parsed == null) return [];
   let proposalsRaw;
   if (Array.isArray(parsed)) {
@@ -12374,6 +13598,7 @@ export {
   ExperimentTracker,
   FAILURE_CLASSES,
   FileSystemExperimentStore,
+  FileSystemFeedbackTrajectoryStore,
   FileSystemOutcomeStore,
   FileSystemTraceStore,
   HeldOutGate,
@@ -12381,6 +13606,7 @@ export {
   HoldoutLockedError,
   INTENT_MATCH_JUDGE_VERSION,
   InMemoryExperimentStore,
+  InMemoryFeedbackTrajectoryStore,
   InMemoryOutcomeStore,
   InMemoryTraceStore,
   InMemoryTrialCache,
@@ -12420,9 +13646,11 @@ export {
   adversarialJudge,
   aggregateLlm,
   aggregateRunScore,
+  allCriticalPassed,
   analyzeAntiSlop,
   analyzeSeries,
   argHash,
+  assignFeedbackSplit,
   attributeCounterfactuals,
   deterministicSplit as benchmarkDeterministicSplit,
   benchmarks_exports as benchmarks,
@@ -12460,6 +13688,8 @@ export {
   computeToolUseMetrics,
   confidenceInterval,
   containsAll,
+  controlFailureClassFromVerification,
+  controlRunToFeedbackTrajectory,
   correlateLayers,
   correlationStudy,
   createAntiSlopJudge,
@@ -12467,6 +13697,7 @@ export {
   createCustomJudge,
   createDefaultReviewer,
   createDomainExpertJudge,
+  createFeedbackTrajectory,
   createIntentMatchJudge,
   createLlmReviewer,
   createSandboxCodeMutator,
@@ -12495,6 +13726,10 @@ export {
   extractAssetUrls,
   extractErrorCount,
   failureClusterView,
+  feedbackTrajectoriesToDatasetScenarios,
+  feedbackTrajectoriesToOptimizerRows,
+  feedbackTrajectoryToDatasetScenario,
+  feedbackTrajectoryToOptimizerRow,
   fileContains,
   fileExists,
   findAutoMatchNoExpectation,
@@ -12549,6 +13784,7 @@ export {
   nonRefusalRubric,
   normalizeScores,
   notBlocked,
+  objectiveEval,
   outputLengthRubric,
   pairedBootstrap,
   pairedTTest,
@@ -12557,6 +13793,7 @@ export {
   paretoChart,
   paretoFrontier,
   paretoFrontierWithCrowding,
+  parseFeedbackTrajectoriesJsonl,
   parseReflectionResponse,
   parseRunRecordSafe,
   partialCredit,
@@ -12583,6 +13820,7 @@ export {
   renderMarkdown,
   renderMarkdownReport,
   renderPlaybookMarkdown,
+  renderPreferenceMemoryMarkdown,
   renderSteeringText,
   replayScorerOverCorpus,
   replayTraceThroughJudge,
@@ -12592,6 +13830,7 @@ export {
   roundTripRunRecord,
   rowCount,
   rowWhere,
+  runAgentControlLoop,
   runAssertions,
   runCanaries,
   runCounterfactual,
@@ -12605,6 +13844,7 @@ export {
   runKeywordCoverageJudgeUrl,
   runPromptEvolution,
   runProposeReview,
+  runProposeReviewAsControlLoop,
   runReferenceReplay,
   runSelfPlay,
   runSemanticConceptJudge,
@@ -12621,13 +13861,18 @@ export {
   selectHarnessVariant,
   selfPreference,
   sentenceReorderMutator,
+  serializeFeedbackTrajectoriesJsonl,
   signManifest,
   soc2Report,
   statusAdvanced,
+  stopOnNoProgress,
+  stopOnRepeatedAction,
   stripFencedJson,
   stuckLoopView,
+  subjectiveEval,
   summarize,
   summarizeHarnessResults,
+  summarizePreferenceMemory,
   summaryTable,
   testJudge,
   textInSnapshot,
@@ -12653,6 +13898,7 @@ export {
   welchsTTest,
   whitespaceCollapseMutator,
   wilcoxonSignedRank,
+  withAssignedFeedbackSplit,
   wranglerDeployRunner
 };
 //# sourceMappingURL=index.js.map