brooder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
brooder/__init__.py ADDED
@@ -0,0 +1,31 @@
"""Brooder — snapshot testing for AI agents.

Public API:
    brooder.record(name)           decorator that captures an agent's runs
    brooder.tool_call(...)         log a tool call / step into the active run
    brooder.turn                   re-exported from ``recorder`` (see that module for its contract)
    brooder.instrument(client)     auto-capture an LLM client's tool calls
    brooder.claude_agent_hooks()   hooks mapping for the Claude Agent SDK
"""

from __future__ import annotations

from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as _pkg_version

from .integrations import instrument
from .integrations.claude_agent import claude_agent_hooks
from .recorder import record, tool_call, turn

# Read the version from the installed "brooder" distribution's metadata; fall back to a
# placeholder when no such distribution is installed.
try:
    __version__ = _pkg_version("brooder")
except PackageNotFoundError:  # pragma: no cover - only when running from a non-installed tree
    __version__ = "0.0.0+unknown"

# Names exported by ``from brooder import *``.
__all__ = [
    "__version__",
    "claude_agent_hooks",
    "instrument",
    "record",
    "tool_call",
    "turn",
]
brooder/analysis.py ADDED
@@ -0,0 +1,79 @@
1
+ """Turn captured runs into verdicts.
2
+
3
+ Separated from capture (``recorder``) so we can group multiple runs of the same case and
4
+ detect flakiness. For each case:
5
+
6
+ - more than one run that disagree with each other -> FLAKY
7
+ - otherwise, compare the representative run to its baseline -> PASS / REGRESSED / NEW
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ from . import storage
16
+ from .diffing import compare
17
+ from .judges import ExactJudge, Judge
18
+ from .models import Change, Diff, Run, Verdict
19
+
20
_FLAKY_PENALTY = 40  # stability points subtracted from 100 for a case reported as FLAKY
21
+
22
+
23
+ def _runs_agree(runs: list[Run], judge: Judge, observe_results: bool) -> bool:
24
+ first = runs[0]
25
+ return all(
26
+ not compare(first, other, judge, observe_results=observe_results).changes
27
+ for other in runs[1:]
28
+ )
29
+
30
+
31
def analyze(
    runs: list[Run],
    base: Optional[Path] = None,
    judge: Optional[Judge] = None,
    observe_results: bool = False,
) -> list[Diff]:
    """Bucket captured runs per case and emit a single diff for each case.

    When a case has several runs that do not agree with one another, the case is reported as
    ``FLAKY``. In every other situation the first run of the case is diffed against the stored
    baseline.

    Args:
        runs: The captured runs (possibly several per case when using ``--runs``).
        base: Project root handed to ``storage.load_baseline``; ``None`` is passed through.
        judge: Output-equivalence judge; defaults to :class:`~brooder.judges.ExactJudge`.
        observe_results: If ``True``, also diff tool observations through the judge.

    Returns:
        One :class:`~brooder.models.Diff` per distinct ``(agent, case_id)``, in first-seen order.
    """
    judge = judge or ExactJudge()

    # Plain dicts keep insertion order, which is what yields first-seen ordering below.
    by_case: dict[tuple[str, str], list[Run]] = {}
    for captured in runs:
        key = (captured.agent, captured.case_id)
        if key not in by_case:
            by_case[key] = []
        by_case[key].append(captured)

    results: list[Diff] = []
    for (agent, case_id), attempts in by_case.items():
        diverged = len(attempts) > 1 and not _runs_agree(attempts, judge, observe_results)
        if not diverged:
            baseline = storage.load_baseline(agent, case_id, base)
            results.append(
                compare(baseline, attempts[0], judge, observe_results=observe_results)
            )
            continue

        # Repeated runs disagree: report one synthetic change instead of a baseline diff.
        divergence = Change(
            path="run",
            kind="changed",
            before="deterministic",
            after=f"{len(attempts)} runs diverge",
        )
        results.append(
            Diff(
                agent=agent,
                case_id=case_id,
                verdict=Verdict.FLAKY,
                changes=[divergence],
                stability=max(0, 100 - _FLAKY_PENALTY),
            )
        )
    return results
brooder/cli.py ADDED
@@ -0,0 +1,281 @@
1
+ """The ``brooder`` command-line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import runpy
7
+ import sys
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ import typer
13
+ from rich.panel import Panel
14
+
15
+ from . import __version__, metrics, recorder, storage
16
+ from .analysis import analyze
17
+ from .config import DEFAULT_CONFIG_YAML, BrooderConfig, load_config
18
+ from .errors import BrooderError, ScriptNotFoundError
19
+ from .judges import make_judge
20
+ from .log import setup_logging
21
+ from .models import Diff, Run
22
+ from .report import (
23
+ console,
24
+ print_diff_detail,
25
+ print_summary,
26
+ render_json,
27
+ render_markdown,
28
+ )
29
+
30
+
31
class OutputFormat(str, Enum):
    """Renderings selectable for results written to stdout."""

    # Human-oriented output; the only mode in which status chatter is printed.
    table = "table"
    # Serialized document produced by ``render_json``.
    json = "json"
    # Serialized document produced by ``render_markdown``.
    markdown = "markdown"
37
+
38
+
39
# Root Typer application; the ``@app.command()`` functions in this module register on it.
# Shell-completion options are disabled and a bare invocation prints the help text.
app = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
    help="Snapshot testing for AI agents — catch behavior regressions before they ship.",
)
44
+
45
+
46
@app.callback(invoke_without_command=True)
def _root(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
    version: bool = typer.Option(False, "--version", help="Show version and exit."),
) -> None:
    # Global options shared by every subcommand. Logging is configured first so that it is
    # active even when only ``--version`` was requested.
    setup_logging(verbose)
    if not version:
        return
    console.print(f"brooder {__version__}")
    raise typer.Exit()
55
+
56
+
57
def _exec_script(
    script: str,
    mode: str,
    model: Optional[str] = None,
    times: int = 1,
    max_steps: int = 0,
) -> list[Run]:
    """Execute a user script under Brooder ``times`` times; return the captured runs.

    The script is executed in-process with ``runpy.run_path`` as ``__main__`` while
    ``sys.argv`` is temporarily replaced by the script path. Settings are handed over through
    ``BROODER_*`` environment variables. All of them — including ``BROODER_MODEL`` — are removed
    again when the call finishes, whether it succeeds or raises, so no setting outlives the call.

    Args:
        script: Path of the script to execute.
        mode: Value exported as ``BROODER_MODE``.
        model: Optional value exported as ``BROODER_MODEL``; the variable is unset when omitted.
        times: Number of executions; values below 1 are treated as 1.
        max_steps: Value exported as ``BROODER_MAX_STEPS``.

    Returns:
        A list copy of ``recorder.SESSION`` taken after the last execution.

    Raises:
        ScriptNotFoundError: If ``script`` does not exist.
    """
    path = Path(script)
    if not path.exists():
        raise ScriptNotFoundError(f"script not found: {script}")

    recorder.reset_session()
    saved_argv = sys.argv
    try:
        os.environ["BROODER_MODE"] = mode
        os.environ["BROODER_MAX_STEPS"] = str(max_steps)
        if model:
            os.environ["BROODER_MODEL"] = model
        else:
            # Drop any stale label so an unlabeled run is not attributed to an earlier model.
            os.environ.pop("BROODER_MODEL", None)

        sys.argv = [str(path)]
        for index in range(max(1, times)):
            os.environ["BROODER_RUN_INDEX"] = str(index)
            runpy.run_path(str(path), run_name="__main__")
    finally:
        sys.argv = saved_argv
        # Remove every variable this function may have exported, not just a subset.
        for name in (
            "BROODER_MODE",
            "BROODER_RUN_INDEX",
            "BROODER_MAX_STEPS",
            "BROODER_MODEL",
        ):
            os.environ.pop(name, None)

    return list(recorder.SESSION)
90
+
91
+
92
def _analyze(runs: list[Run], cfg: BrooderConfig) -> list[Diff]:
    """Analyze ``runs`` with the judge and observation setting taken from ``cfg``."""
    configured_judge = make_judge(cfg.judge)
    diff_observations = cfg.trajectory.observations
    return analyze(runs, judge=configured_judge, observe_results=diff_observations)
98
+
99
+
100
+ def _regressions(diffs: list[Diff]) -> list[Diff]:
101
+ return [diff for diff in diffs if not diff.ok]
102
+
103
+
104
+ def _resolve_format(output_format: OutputFormat, json_flag: bool) -> OutputFormat:
105
+ """``--json`` is a convenience alias for ``--format json`` and wins if both are given."""
106
+ return OutputFormat.json if json_flag else output_format
107
+
108
+
109
def _emit(diffs: list[Diff], fmt: OutputFormat) -> None:
    """Write the results to stdout in the format ``fmt``.

    For ``json`` and ``markdown`` nothing but the serialized document is printed, keeping
    stdout machine-parseable. Any other format prints the summary followed by a detail panel
    for each diff returned by ``_regressions``.
    """
    if fmt is OutputFormat.json:
        print(render_json(diffs))
        return
    if fmt is OutputFormat.markdown:
        print(render_markdown(diffs))
        return

    print_summary(diffs)
    for failing in _regressions(diffs):
        print_diff_detail(failing)
123
+
124
+
125
def _emit_metrics(diffs: list[Diff], cfg: BrooderConfig) -> None:
    """Hand the diffs to ``metrics.emit`` together with the configured OTLP endpoint.

    NOTE(review): the environment fallback and the best-effort / never-raises guarantee are
    properties of ``metrics.emit`` itself — confirm there; nothing here catches exceptions.
    """
    endpoint = cfg.metrics.otlp_endpoint
    metrics.emit(diffs, endpoint=endpoint)
128
+
129
+
130
def _status(message: str, fmt: OutputFormat) -> None:
    """Print ``message`` on the console, but only when rendering in table mode.

    Staying silent for json/markdown keeps their stdout limited to the serialized document.
    """
    if fmt is not OutputFormat.table:
        return
    console.print(message)
134
+
135
+
136
@app.command()
def init() -> None:
    """Create brooder.yaml and the local .brooder/ directory."""
    # An existing config file is reported and left untouched.
    target = Path("brooder.yaml")
    if target.exists():
        console.print("brooder.yaml already exists")
    else:
        target.write_text(DEFAULT_CONFIG_YAML, encoding="utf-8")
        console.print("[green]Created brooder.yaml[/]")

    baselines_dir = Path(".brooder", "baselines")
    baselines_dir.mkdir(parents=True, exist_ok=True)
    console.print("Ready. Record baselines with: [bold]brooder record <script>[/]")
147
+
148
+
149
@app.command()
def record(script: str) -> None:
    """Record an agent's real runs as golden baselines."""
    # Only the step limit is taken from the configuration; the script runs once.
    step_limit = load_config().trajectory.max_steps
    baselines = _exec_script(script, "record", max_steps=step_limit)
    console.print(f"[green]Recorded {len(baselines)} baseline(s).[/]")
155
+
156
+
157
@app.command()
def run(
    script: str,
    model: Optional[str] = typer.Option(None, "--model", help="Label this run's model."),
    runs: Optional[int] = typer.Option(None, "--runs", help="Repeat each case N times."),
    output_format: OutputFormat = typer.Option(
        OutputFormat.table, "--format", help="Output format: table | json | markdown."
    ),
    json_out: bool = typer.Option(False, "--json", help="Shortcut for --format json."),
) -> None:
    """Re-run an agent and diff its behavior against the baselines."""
    cfg = load_config()
    fmt = _resolve_format(output_format, json_out)
    # An explicit ``--runs`` wins; otherwise the configured repeat count applies.
    repeat = cfg.runs if runs is None else runs
    captured = _exec_script(
        script, "run", model, times=repeat, max_steps=cfg.trajectory.max_steps
    )
    diffs = _analyze(captured, cfg)
    _emit(diffs, fmt)
    _emit_metrics(diffs, cfg)

    failing = _regressions(diffs)
    if not failing:
        _status("[green]No regressions.[/]", fmt)
        return
    _status(f"[red]{len(failing)} issue(s) detected.[/]", fmt)
    raise typer.Exit(1)
180
+
181
+
182
@app.command()
def diff(
    output_format: OutputFormat = typer.Option(
        OutputFormat.table, "--format", help="Output format: table | json | markdown."
    ),
    json_out: bool = typer.Option(False, "--json", help="Shortcut for --format json."),
) -> None:
    """Show the detailed behavioral diff for the latest runs."""
    fmt = _resolve_format(output_format, json_out)
    runs = list(storage.iter_runs())
    if not runs:
        _status("No runs yet. Run [bold]brooder run <script>[/] first.", fmt)
        raise typer.Exit()

    # Go through ``_analyze`` so the configured judge is honored exactly as in ``run`` and
    # ``ci``; calling ``analyze`` directly would silently fall back to its default judge.
    diffs = _analyze(runs, load_config())
    if fmt is OutputFormat.json:
        print(render_json(diffs))
    elif fmt is OutputFormat.markdown:
        print(render_markdown(diffs))
    else:
        # Unlike ``_emit``, table mode here prints the detail of every case, not only the
        # failing ones, and omits the summary.
        for stored in diffs:
            print_diff_detail(stored)
204
+
205
+
206
@app.command()
def approve() -> None:
    """Accept the latest runs as the new baselines (like `jest -u`)."""
    # The storage layer performs the promotion and returns how many runs it promoted.
    promoted = storage.promote_runs_to_baselines()
    console.print(f"[green]Promoted {promoted} run(s) to baselines.[/]")
211
+
212
+
213
@app.command()
def ci(
    script: str,
    output_format: OutputFormat = typer.Option(
        OutputFormat.table, "--format", help="Output format: table | json | markdown."
    ),
    json_out: bool = typer.Option(False, "--json", help="Shortcut for --format json."),
) -> None:
    """CI mode: fail the build if regressions exceed the configured threshold."""
    cfg = load_config()
    fmt = _resolve_format(output_format, json_out)
    captured = _exec_script(script, "run", times=cfg.runs, max_steps=cfg.trajectory.max_steps)
    diffs = _analyze(captured, cfg)
    _emit(diffs, fmt)
    _emit_metrics(diffs, cfg)

    found = len(_regressions(diffs))
    limit = cfg.regression_threshold
    # The build fails only when the count is strictly above the threshold.
    if found <= limit:
        _status("[green]Within threshold. Passing.[/]", fmt)
        return
    _status(f"[red]{found} issue(s) > threshold {limit}. Failing.[/]", fmt)
    raise typer.Exit(1)
236
+
237
+
238
@app.command()
def migrate(
    script: str,
    from_model: str = typer.Option(..., "--from", help="The model you're migrating FROM."),
    to_model: str = typer.Option(..., "--to", help="The model you're migrating TO."),
) -> None:
    """Model Migration Report: what behavior changes if you switch models?"""
    cfg = load_config()
    step_limit = cfg.trajectory.max_steps

    # Phase 1: record baselines while labeled with the source model.
    console.rule(f"Recording baseline on [cyan]{from_model}[/]")
    _exec_script(script, "record", from_model, max_steps=step_limit)

    # Phase 2: re-run labeled with the target model and diff against those baselines.
    console.rule(f"Re-running on [cyan]{to_model}[/]")
    reruns = _exec_script(script, "run", to_model, cfg.runs, step_limit)
    diffs = _analyze(reruns, cfg)
    print_summary(diffs)

    changed = _regressions(diffs)
    report = Panel(
        f"[bold]{len(changed)} of {len(diffs)}[/] cases change behavior when migrating "
        f"[cyan]{from_model}[/] → [cyan]{to_model}[/].",
        title="Model Migration Report",
        border_style="red" if changed else "green",
    )
    console.print(report)
    for entry in changed:
        print_diff_detail(entry)

    # Any behavior change makes the command exit with status 1.
    if changed:
        raise typer.Exit(1)
269
+
270
+
271
def main() -> None:
    """Run the Typer app, turning any ``BrooderError`` into a printed message and exit code 2."""
    try:
        app()
    except BrooderError as failure:
        console.print(f"[red]error:[/] {failure}")
        # Chain the original error so it stays available as ``__cause__``.
        raise SystemExit(2) from failure


if __name__ == "__main__":
    main()
brooder/config.py ADDED
@@ -0,0 +1,88 @@
1
+ """Typed configuration loaded from an optional ``brooder.yaml``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Literal, Optional
7
+
8
+ import yaml
9
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError
10
+
11
+ from .errors import ConfigError
12
+
13
CONFIG_FILE = "brooder.yaml"  # file name that load_config() looks up under the project root
14
+
15
+
16
class TrajectoryConfig(BaseModel):
    """Trajectory-specific diff and guardrail settings."""

    # Unknown keys under ``trajectory:`` are rejected instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    observations: bool = Field(
        False,
        description="Also diff tool observations (results) via the judge.",
    )
    max_steps: int = Field(
        0,
        ge=0,
        description="0 = unlimited; >0 aborts and flags runaway loops.",
    )
27
+
28
+
29
class MetricsConfig(BaseModel):
    """Optional machine-readable metric emission settings."""

    # Unknown keys under ``metrics:`` are rejected instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    otlp_endpoint: Optional[str] = Field(
        None,
        description="OTLP endpoint for metric emission; also honors OTEL_EXPORTER_OTLP_ENDPOINT.",
    )
38
+
39
+
40
class BrooderConfig(BaseModel):
    """Validated project configuration."""

    # Unknown top-level keys are rejected instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    # Name of the output-equivalence judge.
    judge: Literal["exact", "llm"] = "exact"
    # Number of failing cases ``brooder ci`` tolerates before it fails.
    regression_threshold: int = Field(0, ge=0)
    # Default number of executions of the script per invocation.
    runs: int = Field(1, ge=1)
    # Nested sections; each gets a fresh default instance when omitted.
    trajectory: TrajectoryConfig = Field(default_factory=TrajectoryConfig)
    metrics: MetricsConfig = Field(default_factory=MetricsConfig)
50
+
51
+
52
def load_config(base: Optional[Path] = None) -> BrooderConfig:
    """Read ``brooder.yaml`` from the project root and validate it.

    Args:
        base: Project root; defaults to the current working directory.

    Returns:
        The validated configuration. A missing file, or one that parses to an empty/falsy
        document, yields the defaults.

    Raises:
        ConfigError: If the file exists but contains invalid YAML or invalid values.
    """
    root = base or Path.cwd()
    config_path = root / CONFIG_FILE
    if not config_path.exists():
        return BrooderConfig()

    try:
        raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    except yaml.YAMLError as exc:
        raise ConfigError(f"invalid YAML in {config_path}: {exc}") from exc

    try:
        # An empty document parses to ``None``; validate an empty mapping in that case.
        return BrooderConfig.model_validate(raw or {})
    except ValidationError as exc:
        raise ConfigError(f"invalid config in {config_path}: {exc}") from exc
75
+
76
+
77
# Template written by ``brooder init``. The keys mirror ``BrooderConfig``; the ``trajectory``
# children must stay nested, because unknown top-level keys are rejected (``extra="forbid"``).
DEFAULT_CONFIG_YAML = """# Brooder configuration
judge: exact              # exact | llm (llm-judge is on the roadmap)
regression_threshold: 0   # max regressed cases allowed before `brooder ci` fails
runs: 1                   # executions per case; with >1, cases whose runs disagree are FLAKY

trajectory:
  observations: false     # also diff tool results (via the judge); off by default (noisy)
  max_steps: 0            # 0 = unlimited; >0 aborts a run and flags runaway loops

# metrics:                # optional OTLP metric emission (needs the `otel` extra)
#   otlp_endpoint: http://localhost:4318/v1/metrics   # or set OTEL_EXPORTER_OTLP_ENDPOINT
"""
brooder/diffing.py ADDED
@@ -0,0 +1,217 @@
1
+ """The behavioral diff engine — the core of Brooder.
2
+
3
+ Compares a fresh run to its golden baseline by aligning their **trajectories**. Each step is
4
+ reduced to a signature ``(kind, name, args)`` and the two step sequences are aligned with an LCS
5
+ diff, so an inserted, dropped, or reordered step is reported at the position where the path
6
+ *diverged* — it doesn't cascade into every later step. Turn- and step-counts are emitted as
7
+ aggregate signals. The final *output* is compared through a :class:`~brooder.judges.Judge` so
8
+ semantically-equivalent wording is not a regression; tool observations are compared separately and
9
+ more leniently (see ``design/trajectory.md``). Flakiness scoring lives in ``analysis``.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import difflib
15
+ import json
16
+ from typing import Any, Optional
17
+
18
+ from .judges import ExactJudge, Judge
19
+ from .models import Change, Diff, Run, Step, StepKind, Verdict
20
+
21
+ _CHANGE_PENALTY = 20 # stability points lost per behavioral change
22
+
23
+
24
def compare(
    baseline: Optional[Run],
    current: Run,
    judge: Optional[Judge] = None,
    *,
    observe_results: bool = False,
) -> Diff:
    """Diff ``current`` against ``baseline`` and summarize the result as a verdict.

    Trajectory differences are collected first by ``_diff_trajectory``; the final outputs are
    then checked through ``judge.equivalent`` so that wording the judge considers equivalent is
    not reported.

    Args:
        baseline: The golden run, or ``None`` if the case has no baseline yet.
        current: The run under test.
        judge: Output-equivalence judge; defaults to :class:`~brooder.judges.ExactJudge`.
        observe_results: Forwarded to the trajectory diff; when ``True`` the observations of
            aligned tool steps are compared through ``judge`` as well.

    Returns:
        A :class:`~brooder.models.Diff` whose verdict is ``NEW`` (no baseline), ``PASS`` (no
        changes) or ``REGRESSED`` (changes found). Stability starts at 100 and drops by
        ``_CHANGE_PENALTY`` per change, never below 0.
    """
    judge = judge or ExactJudge()

    # Nothing to compare against: report the case as new with full stability.
    if baseline is None:
        return Diff(
            agent=current.agent,
            case_id=current.case_id,
            verdict=Verdict.NEW,
            stability=100,
        )

    found: list[Change] = []
    _diff_trajectory(baseline, current, found, judge, observe_results)

    outputs_match = judge.equivalent(baseline.output, current.output)
    if not outputs_match:
        found.append(
            Change(path="output", kind="changed", before=baseline.output, after=current.output)
        )

    if found:
        verdict = Verdict.REGRESSED
        stability = max(0, 100 - len(found) * _CHANGE_PENALTY)
    else:
        verdict = Verdict.PASS
        stability = 100

    return Diff(
        agent=current.agent,
        case_id=current.case_id,
        verdict=verdict,
        changes=found,
        stability=stability,
    )
76
+
77
+
78
+ def _step_signature(step: Step) -> tuple[str, str, str]:
79
+ """A stable, hashable identity for a step: its kind, name, and canonical args.
80
+
81
+ Observations (a ``TOOL`` step's ``result``) are deliberately excluded — they're diffed
82
+ separately and more leniently, so noisy tool output doesn't distort path alignment. A ``FINAL``
83
+ step carries the output in ``result``, so output changes never surface here; the output judge
84
+ owns them.
85
+ """
86
+ return (step.kind.value, step.name, json.dumps(step.args, sort_keys=True, default=str))
87
+
88
+
89
+ def _step_summary(step: Step) -> dict[str, Any]:
90
+ """A compact, path-focused view of a step for a :class:`Change` payload (no observation)."""
91
+ return {"kind": step.kind.value, "name": step.name, "args": step.args}
92
+
93
+
94
def _diff_trajectory(
    baseline: Run,
    current: Run,
    changes: list[Change],
    judge: Judge,
    observe_results: bool,
) -> None:
    """Align both trajectories by step signature and append the differences to ``changes``.

    Per-step changes come first, in opcode order; the guardrail signal and the aggregate
    turn/step-count changes follow. Removed and changed steps are addressed by their baseline
    index, added steps by their index in the current run.
    """
    old_steps, new_steps = baseline.trajectory, current.trajectory
    old_signatures = [_step_signature(step) for step in old_steps]
    new_signatures = [_step_signature(step) for step in new_steps]
    aligner = difflib.SequenceMatcher(a=old_signatures, b=new_signatures, autojunk=False)

    def removed(index: int) -> Change:
        return Change(
            path=f"trajectory[{index}]", kind="removed", before=_step_summary(old_steps[index])
        )

    def added(index: int) -> Change:
        return Change(
            path=f"trajectory[{index}]", kind="added", after=_step_summary(new_steps[index])
        )

    for tag, i1, i2, j1, j2 in aligner.get_opcodes():
        if tag == "equal":
            # Same path on both sides; only the observations can still differ.
            if observe_results:
                _diff_observations(old_steps, new_steps, i1, i2, j1, judge, changes)
        elif tag == "delete":
            changes.extend(removed(i) for i in range(i1, i2))
        elif tag == "insert":
            changes.extend(added(j) for j in range(j1, j2))
        else:
            # "replace": pair positions one-to-one, then spill the leftover of the longer side.
            paired = min(i2 - i1, j2 - j1)
            for offset in range(paired):
                old_index, new_index = i1 + offset, j1 + offset
                changes.append(
                    Change(
                        path=f"trajectory[{old_index}]",
                        kind="changed",
                        before=_step_summary(old_steps[old_index]),
                        after=_step_summary(new_steps[new_index]),
                    )
                )
            # At most one of these two ranges is non-empty.
            changes.extend(added(j) for j in range(j1 + paired, j2))
            changes.extend(removed(i) for i in range(i1 + paired, i2))

    _diff_guardrails(baseline, current, changes)

    if baseline.turns != current.turns:
        changes.append(
            Change(
                path="trajectory.turns",
                kind="changed",
                before=baseline.turns,
                after=current.turns,
            )
        )
    if baseline.step_count != current.step_count:
        changes.append(
            Change(
                path="trajectory.steps",
                kind="changed",
                before=baseline.step_count,
                after=current.step_count,
            )
        )
165
+
166
+
167
+ def _diff_guardrails(baseline: Run, current: Run, changes: list[Change]) -> None:
168
+ """Emit a guardrail signal when a run's terminal condition changed (runaway or gave-up).
169
+
170
+ ``runaway`` takes precedence: a runaway run also lacks a ``FINAL`` step, so reporting both would
171
+ be redundant — the runaway signal already explains why the agent produced no answer.
172
+ """
173
+ if baseline.runaway != current.runaway:
174
+ changes.append(
175
+ Change(
176
+ path="trajectory.runaway",
177
+ kind="changed",
178
+ before=baseline.runaway,
179
+ after=current.runaway,
180
+ )
181
+ )
182
+ elif baseline.gave_up != current.gave_up:
183
+ changes.append(
184
+ Change(
185
+ path="trajectory.gave_up",
186
+ kind="changed",
187
+ before=baseline.gave_up,
188
+ after=current.gave_up,
189
+ )
190
+ )
191
+
192
+
193
+ def _diff_observations(
194
+ b: list[Step],
195
+ c: list[Step],
196
+ i1: int,
197
+ i2: int,
198
+ j1: int,
199
+ judge: Judge,
200
+ changes: list[Change],
201
+ ) -> None:
202
+ """Diff the observations of aligned (equal-signature) ``TOOL`` steps through the judge.
203
+
204
+ Only ``TOOL`` steps carry observations worth comparing; a ``FINAL`` step's result is the output
205
+ (already judged), and a ``TURN`` has none. Opt-in via ``trajectory.observations``.
206
+ """
207
+ for offset in range(i2 - i1):
208
+ before, after = b[i1 + offset], c[j1 + offset]
209
+ if before.kind is StepKind.TOOL and not judge.equivalent(before.result, after.result):
210
+ changes.append(
211
+ Change(
212
+ path=f"trajectory[{j1 + offset}].result",
213
+ kind="changed",
214
+ before=before.result,
215
+ after=after.result,
216
+ )
217
+ )