PyPI - brooder - Versions diffs - 0.1.0__py3-none-any.whl - Mend

brooder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

brooder/__init__.py +31 -0
brooder/analysis.py +79 -0
brooder/cli.py +281 -0
brooder/config.py +88 -0
brooder/diffing.py +217 -0
brooder/errors.py +31 -0
brooder/integrations/__init__.py +75 -0
brooder/integrations/anthropic.py +46 -0
brooder/integrations/base.py +170 -0
brooder/integrations/bedrock.py +49 -0
brooder/integrations/claude_agent.py +164 -0
brooder/integrations/google.py +61 -0
brooder/integrations/langchain.py +321 -0
brooder/integrations/openai.py +43 -0
brooder/integrations/openai_agents.py +208 -0
brooder/integrations/otel.py +216 -0
brooder/judges.py +109 -0
brooder/log.py +33 -0
brooder/metrics.py +116 -0
brooder/models.py +148 -0
brooder/py.typed +1 -0
brooder/recorder.py +342 -0
brooder/report.py +261 -0
brooder/storage.py +150 -0
brooder-0.1.0.dist-info/METADATA +338 -0
brooder-0.1.0.dist-info/RECORD +30 -0
brooder-0.1.0.dist-info/WHEEL +4 -0
brooder-0.1.0.dist-info/entry_points.txt +2 -0
brooder-0.1.0.dist-info/licenses/LICENSE +201 -0
brooder-0.1.0.dist-info/licenses/NOTICE +7 -0

brooder/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Brooder — snapshot testing for AI agents.
+Public API:
+    brooder.record(name)          decorator that captures an agent's runs
+    brooder.tool_call(...)        log a tool call / step into the active run
+    brooder.turn                  re-exported from ``brooder.recorder`` (see that module)
+    brooder.instrument(client)    auto-capture an LLM client's tool calls
+    brooder.claude_agent_hooks()  hooks mapping for the Claude Agent SDK
+"""
+from __future__ import annotations
+from importlib.metadata import PackageNotFoundError
+from importlib.metadata import version as _pkg_version
+from .integrations import instrument
+from .integrations.claude_agent import claude_agent_hooks
+from .recorder import record, tool_call, turn
+# Read the version from the installed distribution's metadata; fall back to a
+# placeholder string when no "brooder" distribution can be found.
+try:
+    __version__ = _pkg_version("brooder")
+except PackageNotFoundError:  # pragma: no cover - only when running from a non-installed tree
+    __version__ = "0.0.0+unknown"
+# Names re-exported as the package's public surface.
+__all__ = [
+    "__version__",
+    "claude_agent_hooks",
+    "instrument",
+    "record",
+    "tool_call",
+    "turn",
+]

brooder/analysis.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""Turn captured runs into verdicts.
+Separated from capture (``recorder``) so we can group multiple runs of the same case and
+detect flakiness. For each case:
+- more than one run that disagree with each other  -> FLAKY
+- otherwise, compare the representative run to its baseline -> PASS / REGRESSED / NEW
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional
+from . import storage
+from .diffing import compare
+from .judges import ExactJudge, Judge
+from .models import Change, Diff, Run, Verdict
+# Stability points subtracted from 100 for a case reported as FLAKY.
+_FLAKY_PENALTY = 40
+def _runs_agree(runs: list[Run], judge: Judge, observe_results: bool) -> bool:
+    first = runs[0]
+    return all(
+        not compare(first, other, judge, observe_results=observe_results).changes
+        for other in runs[1:]
+    )
+def analyze(
+    runs: list[Run],
+    base: Optional[Path] = None,
+    judge: Optional[Judge] = None,
+    observe_results: bool = False,
+) -> list[Diff]:
+    """Group captured runs by case and produce one diff per case.
+    A case whose repeated runs disagree with each other is reported as ``FLAKY``; otherwise
+    the representative run is compared to its baseline.
+    Args:
+        runs: The captured runs (possibly several per case when using ``--runs``).
+        base: Project root; defaults to the current working directory.
+        judge: Output-equivalence judge; defaults to :class:`~brooder.judges.ExactJudge`.
+        observe_results: If ``True``, also diff tool observations through the judge.
+    Returns:
+        One :class:`~brooder.models.Diff` per distinct case, in first-seen order.
+    """
+    judge = judge or ExactJudge()
+    groups: dict[tuple[str, str], list[Run]] = {}
+    for run in runs:
+        groups.setdefault((run.agent, run.case_id), []).append(run)
+    diffs: list[Diff] = []
+    for (agent, case_id), case_runs in groups.items():
+        if len(case_runs) > 1 and not _runs_agree(case_runs, judge, observe_results):
+            diffs.append(
+                Diff(
+                    agent=agent,
+                    case_id=case_id,
+                    verdict=Verdict.FLAKY,
+                    changes=[
+                        Change(
+                            path="run",
+                            kind="changed",
+                            before="deterministic",
+                            after=f"{len(case_runs)} runs diverge",
+                        )
+                    ],
+                    stability=max(0, 100 - _FLAKY_PENALTY),
+                )
+            )
+        else:
+            baseline = storage.load_baseline(agent, case_id, base)
+            diffs.append(compare(baseline, case_runs[0], judge, observe_results=observe_results))
+    return diffs

brooder/cli.py ADDED Viewed

@@ -0,0 +1,281 @@
+"""The ``brooder`` command-line interface."""
+from __future__ import annotations
+import os
+import runpy
+import sys
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+import typer
+from rich.panel import Panel
+from . import __version__, metrics, recorder, storage
+from .analysis import analyze
+from .config import DEFAULT_CONFIG_YAML, BrooderConfig, load_config
+from .errors import BrooderError, ScriptNotFoundError
+from .judges import make_judge
+from .log import setup_logging
+from .models import Diff, Run
+from .report import (
+    console,
+    print_diff_detail,
+    print_summary,
+    render_json,
+    render_markdown,
+)
+class OutputFormat(str, Enum):
+    """How results are rendered to stdout."""
+    # Human-readable console output (the default for every command that takes --format).
+    table = "table"
+    # Serialized JSON document produced by ``render_json``.
+    json = "json"
+    # Serialized Markdown document produced by ``render_markdown``.
+    markdown = "markdown"
+# Root Typer application; the commands below register themselves with ``@app.command()``.
+app = typer.Typer(
+    add_completion=False,
+    no_args_is_help=True,
+    help="Snapshot testing for AI agents — catch behavior regressions before they ship.",
+)
+@app.callback(invoke_without_command=True)
+def _root(
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
+    version: bool = typer.Option(False, "--version", help="Show version and exit."),
+) -> None:
+    setup_logging(verbose)
+    if version:
+        console.print(f"brooder {__version__}")
+        raise typer.Exit()
+def _exec_script(
+    script: str,
+    mode: str,
+    model: Optional[str] = None,
+    times: int = 1,
+    max_steps: int = 0,
+) -> list[Run]:
+    """Execute a user script under Brooder ``times`` times; return the captured runs."""
+    path = Path(script)
+    if not path.exists():
+        raise ScriptNotFoundError(f"script not found: {script}")
+    recorder.reset_session()
+    os.environ["BROODER_MODE"] = mode
+    os.environ["BROODER_MAX_STEPS"] = str(max_steps)
+    if model:
+        os.environ["BROODER_MODEL"] = model
+    else:
+        os.environ.pop("BROODER_MODEL", None)
+    saved_argv = sys.argv
+    sys.argv = [str(path)]
+    try:
+        for index in range(max(1, times)):
+            os.environ["BROODER_RUN_INDEX"] = str(index)
+            runpy.run_path(str(path), run_name="__main__")
+    finally:
+        sys.argv = saved_argv
+        os.environ.pop("BROODER_MODE", None)
+        os.environ.pop("BROODER_RUN_INDEX", None)
+        os.environ.pop("BROODER_MAX_STEPS", None)
+    return list(recorder.SESSION)
+def _analyze(runs: list[Run], cfg: BrooderConfig) -> list[Diff]:
+    return analyze(
+        runs,
+        judge=make_judge(cfg.judge),
+        observe_results=cfg.trajectory.observations,
+    )
+def _regressions(diffs: list[Diff]) -> list[Diff]:
+    return [diff for diff in diffs if not diff.ok]
+def _resolve_format(output_format: OutputFormat, json_flag: bool) -> OutputFormat:
+    """``--json`` is a convenience alias for ``--format json`` and wins if both are given."""
+    return OutputFormat.json if json_flag else output_format
+def _emit(diffs: list[Diff], fmt: OutputFormat) -> None:
+    """Render results to stdout in the requested format.
+    ``table`` prints the human summary plus a detail panel per regression; ``json`` / ``markdown``
+    print *only* the serialized document so stdout stays machine-parseable.
+    """
+    if fmt is OutputFormat.json:
+        print(render_json(diffs))
+    elif fmt is OutputFormat.markdown:
+        print(render_markdown(diffs))
+    else:
+        print_summary(diffs)
+        for diff in _regressions(diffs):
+            print_diff_detail(diff)
+def _emit_metrics(diffs: list[Diff], cfg: BrooderConfig) -> None:
+    """Emit OTLP metrics if an endpoint is configured (config or env); best-effort, never raises."""
+    metrics.emit(diffs, endpoint=cfg.metrics.otlp_endpoint)
+def _status(message: str, fmt: OutputFormat) -> None:
+    """Print human status chatter only in table mode, so json/markdown stdout stays clean."""
+    if fmt is OutputFormat.table:
+        console.print(message)
+@app.command()
+def init() -> None:
+    """Create brooder.yaml and the local .brooder/ directory."""
+    config_file = Path("brooder.yaml")
+    if not config_file.exists():
+        config_file.write_text(DEFAULT_CONFIG_YAML, encoding="utf-8")
+        console.print("[green]Created brooder.yaml[/]")
+    else:
+        console.print("brooder.yaml already exists")
+    (Path(".brooder") / "baselines").mkdir(parents=True, exist_ok=True)
+    console.print("Ready. Record baselines with: [bold]brooder record <script>[/]")
+@app.command()
+def record(script: str) -> None:
+    """Record an agent's real runs as golden baselines."""
+    cfg = load_config()
+    captured = _exec_script(script, "record", max_steps=cfg.trajectory.max_steps)
+    console.print(f"[green]Recorded {len(captured)} baseline(s).[/]")
+@app.command()
+def run(
+    script: str,
+    model: Optional[str] = typer.Option(None, "--model", help="Label this run's model."),
+    runs: Optional[int] = typer.Option(None, "--runs", help="Repeat each case N times."),
+    output_format: OutputFormat = typer.Option(
+        OutputFormat.table, "--format", help="Output format: table | json | markdown."
+    ),
+    json_out: bool = typer.Option(False, "--json", help="Shortcut for --format json."),
+) -> None:
+    """Re-run an agent and diff its behavior against the baselines."""
+    cfg = load_config()
+    fmt = _resolve_format(output_format, json_out)
+    times = runs if runs is not None else cfg.runs
+    captured = _exec_script(script, "run", model, times, cfg.trajectory.max_steps)
+    diffs = _analyze(captured, cfg)
+    _emit(diffs, fmt)
+    _emit_metrics(diffs, cfg)
+    regressions = _regressions(diffs)
+    if regressions:
+        _status(f"[red]{len(regressions)} issue(s) detected.[/]", fmt)
+        raise typer.Exit(1)
+    _status("[green]No regressions.[/]", fmt)
+@app.command()
+def diff(
+    output_format: OutputFormat = typer.Option(
+        OutputFormat.table, "--format", help="Output format: table | json | markdown."
+    ),
+    json_out: bool = typer.Option(False, "--json", help="Shortcut for --format json."),
+) -> None:
+    """Show the detailed behavioral diff for the latest runs."""
+    fmt = _resolve_format(output_format, json_out)
+    runs = list(storage.iter_runs())
+    if not runs:
+        _status("No runs yet. Run [bold]brooder run <script>[/] first.", fmt)
+        raise typer.Exit()
+    cfg = load_config()
+    diffs = list(analyze(runs, observe_results=cfg.trajectory.observations))
+    if fmt is OutputFormat.json:
+        print(render_json(diffs))
+    elif fmt is OutputFormat.markdown:
+        print(render_markdown(diffs))
+    else:
+        for stored in diffs:
+            print_diff_detail(stored)
+@app.command()
+def approve() -> None:
+    """Accept the latest runs as the new baselines (like `jest -u`)."""
+    # The storage layer performs the promotion and returns the count that is reported below.
+    count = storage.promote_runs_to_baselines()
+    console.print(f"[green]Promoted {count} run(s) to baselines.[/]")
+@app.command()
+def ci(
+    script: str,
+    output_format: OutputFormat = typer.Option(
+        OutputFormat.table, "--format", help="Output format: table | json | markdown."
+    ),
+    json_out: bool = typer.Option(False, "--json", help="Shortcut for --format json."),
+) -> None:
+    """CI mode: fail the build if regressions exceed the configured threshold."""
+    cfg = load_config()
+    fmt = _resolve_format(output_format, json_out)
+    captured = _exec_script(script, "run", times=cfg.runs, max_steps=cfg.trajectory.max_steps)
+    diffs = _analyze(captured, cfg)
+    _emit(diffs, fmt)
+    _emit_metrics(diffs, cfg)
+    regressions = _regressions(diffs)
+    if len(regressions) > cfg.regression_threshold:
+        _status(
+            f"[red]{len(regressions)} issue(s) > threshold {cfg.regression_threshold}. Failing.[/]",
+            fmt,
+        )
+        raise typer.Exit(1)
+    _status("[green]Within threshold. Passing.[/]", fmt)
+@app.command()
+def migrate(
+    script: str,
+    from_model: str = typer.Option(..., "--from", help="The model you're migrating FROM."),
+    to_model: str = typer.Option(..., "--to", help="The model you're migrating TO."),
+) -> None:
+    """Model Migration Report: what behavior changes if you switch models?"""
+    cfg = load_config()
+    max_steps = cfg.trajectory.max_steps
+    console.rule(f"Recording baseline on [cyan]{from_model}[/]")
+    _exec_script(script, "record", from_model, max_steps=max_steps)
+    console.rule(f"Re-running on [cyan]{to_model}[/]")
+    diffs = _analyze(_exec_script(script, "run", to_model, cfg.runs, max_steps), cfg)
+    print_summary(diffs)
+    changed = _regressions(diffs)
+    border = "red" if changed else "green"
+    console.print(
+        Panel(
+            f"[bold]{len(changed)} of {len(diffs)}[/] cases change behavior when migrating "
+            f"[cyan]{from_model}[/] → [cyan]{to_model}[/].",
+            title="Model Migration Report",
+            border_style=border,
+        )
+    )
+    for diff in changed:
+        print_diff_detail(diff)
+    if changed:
+        raise typer.Exit(1)
+def main() -> None:
+    """Entry point with a single error boundary for user-facing errors.
+    Runs the Typer app; a ``BrooderError`` escaping it is printed as a one-line message and
+    converted into exit status 2 (chained to the original exception).
+    """
+    try:
+        app()
+    except BrooderError as exc:
+        console.print(f"[red]error:[/] {exc}")
+        raise SystemExit(2) from exc
+# Allow running the module directly as a script.
+if __name__ == "__main__":
+    main()

brooder/config.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""Typed configuration loaded from an optional ``brooder.yaml``."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Literal, Optional
+import yaml
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
+from .errors import ConfigError
+# File name that ``load_config`` looks for inside the project root.
+CONFIG_FILE = "brooder.yaml"
+class TrajectoryConfig(BaseModel):
+    """Trajectory-specific diff and guardrail settings."""
+    # Unknown keys are rejected rather than ignored.
+    model_config = ConfigDict(extra="forbid")
+    # Opt-in flag; the CLI forwards it to the analysis as ``observe_results``.
+    observations: bool = Field(
+        default=False, description="Also diff tool observations (results) via the judge."
+    )
+    # Must be >= 0; the CLI exports it to the executed script as BROODER_MAX_STEPS.
+    max_steps: int = Field(
+        default=0, ge=0, description="0 = unlimited; >0 aborts and flags runaway loops."
+    )
+class MetricsConfig(BaseModel):
+    """Optional machine-readable metric emission settings."""
+    # Unknown keys are rejected rather than ignored.
+    model_config = ConfigDict(extra="forbid")
+    # ``None`` by default; the CLI passes this value to ``metrics.emit`` as ``endpoint``.
+    otlp_endpoint: Optional[str] = Field(
+        default=None,
+        description="OTLP endpoint for metric emission; also honors OTEL_EXPORTER_OTLP_ENDPOINT.",
+    )
+class BrooderConfig(BaseModel):
+    """Validated project configuration."""
+    # Unknown top-level keys are rejected rather than ignored.
+    model_config = ConfigDict(extra="forbid")
+    # Judge name; the CLI turns it into a judge object via ``make_judge``.
+    judge: Literal["exact", "llm"] = "exact"
+    # ``brooder ci`` fails only when the number of non-ok cases exceeds this value.
+    regression_threshold: int = Field(default=0, ge=0)
+    # Default number of script executions; ``brooder run --runs`` overrides it.
+    runs: int = Field(default=1, ge=1)
+    # Nested sections get their own default instances when omitted from the file.
+    trajectory: TrajectoryConfig = Field(default_factory=TrajectoryConfig)
+    metrics: MetricsConfig = Field(default_factory=MetricsConfig)
+def load_config(base: Optional[Path] = None) -> BrooderConfig:
+    """Load and validate ``brooder.yaml``.
+    Args:
+        base: Project root; defaults to the current working directory.
+    Returns:
+        The validated configuration, or defaults if the file is absent.
+    Raises:
+        ConfigError: If the file exists but contains invalid YAML or invalid values.
+    """
+    path = (base or Path.cwd()) / CONFIG_FILE
+    if not path.exists():
+        return BrooderConfig()
+    try:
+        data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+    except yaml.YAMLError as exc:
+        raise ConfigError(f"invalid YAML in {path}: {exc}") from exc
+    try:
+        return BrooderConfig.model_validate(data)
+    except ValidationError as exc:
+        raise ConfigError(f"invalid config in {path}: {exc}") from exc
+# Commented starter config; ``brooder init`` writes this text to brooder.yaml when the
+# file does not exist yet.
+DEFAULT_CONFIG_YAML = """# Brooder configuration
+judge: exact              # exact | llm  (llm-judge is on the roadmap)
+regression_threshold: 0   # max regressed cases allowed before `brooder ci` fails
+runs: 1                   # executions per case (flakiness scoring is on the roadmap)
+trajectory:
+  observations: false     # also diff tool results (via the judge); off by default (noisy)
+  max_steps: 0            # 0 = unlimited; >0 aborts a run and flags runaway loops
+# metrics:                # optional OTLP metric emission (needs the `otel` extra)
+#   otlp_endpoint: http://localhost:4318/v1/metrics   # or set OTEL_EXPORTER_OTLP_ENDPOINT
+"""

brooder/diffing.py ADDED Viewed

@@ -0,0 +1,217 @@
+"""The behavioral diff engine — the core of Brooder.
+Compares a fresh run to its golden baseline by aligning their **trajectories**. Each step is
+reduced to a signature ``(kind, name, args)`` and the two step sequences are aligned with an LCS
+diff, so an inserted, dropped, or reordered step is reported at the position where the path
+*diverged* — it doesn't cascade into every later step. Turn- and step-counts are emitted as
+aggregate signals. The final *output* is compared through a :class:`~brooder.judges.Judge` so
+semantically-equivalent wording is not a regression; tool observations are compared separately and
+more leniently (see ``design/trajectory.md``). Flakiness scoring lives in ``analysis``.
+"""
+from __future__ import annotations
+import difflib
+import json
+from typing import Any, Optional
+from .judges import ExactJudge, Judge
+from .models import Change, Diff, Run, Step, StepKind, Verdict
+# Stability points lost per behavioral change; ``compare`` floors the resulting score at 0.
+_CHANGE_PENALTY = 20  # stability points lost per behavioral change
+def compare(
+    baseline: Optional[Run],
+    current: Run,
+    judge: Optional[Judge] = None,
+    *,
+    observe_results: bool = False,
+) -> Diff:
+    """Compare a run to its baseline and produce a behavioral diff.
+    The trajectories are sequence-aligned (LCS) so added, dropped, or reordered steps are reported
+    where the path diverges. The final output is compared through ``judge`` so that
+    semantically-equivalent wording is not reported as a regression. Guardrail terminals
+    (``runaway`` / ``gave up``) are surfaced as distinct signals.
+    Args:
+        baseline: The golden run, or ``None`` if the case has no baseline yet.
+        current: The run under test.
+        judge: Output-equivalence judge; defaults to :class:`~brooder.judges.ExactJudge`.
+        observe_results: If ``True``, also diff aligned tool observations (results) through
+            ``judge``. Off by default because tool output is often noisy.
+    Returns:
+        A :class:`~brooder.models.Diff` with a verdict of ``NEW`` (no baseline),
+        ``PASS`` (no changes), or ``REGRESSED`` (changes found).
+    """
+    judge = judge or ExactJudge()
+    if baseline is None:
+        return Diff(
+            agent=current.agent,
+            case_id=current.case_id,
+            verdict=Verdict.NEW,
+            stability=100,
+        )
+    changes: list[Change] = []
+    _diff_trajectory(baseline, current, changes, judge, observe_results)
+    if not judge.equivalent(baseline.output, current.output):
+        changes.append(
+            Change(path="output", kind="changed", before=baseline.output, after=current.output)
+        )
+    verdict = Verdict.PASS if not changes else Verdict.REGRESSED
+    stability = 100 if not changes else max(0, 100 - len(changes) * _CHANGE_PENALTY)
+    return Diff(
+        agent=current.agent,
+        case_id=current.case_id,
+        verdict=verdict,
+        changes=changes,
+        stability=stability,
+    )
+def _step_signature(step: Step) -> tuple[str, str, str]:
+    """A stable, hashable identity for a step: its kind, name, and canonical args.
+    Observations (a ``TOOL`` step's ``result``) are deliberately excluded — they're diffed
+    separately and more leniently, so noisy tool output doesn't distort path alignment. A ``FINAL``
+    step carries the output in ``result``, so output changes never surface here; the output judge
+    owns them.
+    """
+    return (step.kind.value, step.name, json.dumps(step.args, sort_keys=True, default=str))
+def _step_summary(step: Step) -> dict[str, Any]:
+    """A compact, path-focused view of a step for a :class:`Change` payload (no observation)."""
+    return {"kind": step.kind.value, "name": step.name, "args": step.args}
+def _diff_trajectory(
+    baseline: Run,
+    current: Run,
+    changes: list[Change],
+    judge: Judge,
+    observe_results: bool,
+) -> None:
+    """Align the two trajectories with an LCS diff, emitting per-step and aggregate changes.
+    Results are appended to ``changes`` in place; nothing is returned. Per-step changes come
+    first (in opcode order), followed by the guardrail signal and the turn / step-count
+    aggregates.
+    """
+    b, c = baseline.trajectory, current.trajectory
+    # Steps are matched on their signature only; autojunk is disabled so difflib's
+    # popularity heuristic never drops frequently repeated steps from the alignment.
+    matcher = difflib.SequenceMatcher(
+        a=[_step_signature(s) for s in b],
+        b=[_step_signature(s) for s in c],
+        autojunk=False,
+    )
+    # Each opcode covers baseline indices [i1, i2) and current indices [j1, j2).
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        if tag == "equal":
+            # Same path; optionally compare what the matched tool steps returned.
+            if observe_results:
+                _diff_observations(b, c, i1, i2, j1, judge, changes)
+            continue
+        if tag == "delete":
+            # Steps only in the baseline: reported at their baseline index.
+            for i in range(i1, i2):
+                changes.append(
+                    Change(path=f"trajectory[{i}]", kind="removed", before=_step_summary(b[i]))
+                )
+        elif tag == "insert":
+            # Steps only in the current run: reported at their current index.
+            for j in range(j1, j2):
+                changes.append(
+                    Change(path=f"trajectory[{j}]", kind="added", after=_step_summary(c[j]))
+                )
+        else:  # "replace": pair aligned positions, spill any leftovers as add/remove
+            for k in range(max(i2 - i1, j2 - j1)):
+                bi, cj = i1 + k, j1 + k
+                if bi < i2 and cj < j2:
+                    # Both sides still have a step at this offset: a changed step,
+                    # addressed by its baseline index.
+                    changes.append(
+                        Change(
+                            path=f"trajectory[{bi}]",
+                            kind="changed",
+                            before=_step_summary(b[bi]),
+                            after=_step_summary(c[cj]),
+                        )
+                    )
+                elif cj < j2:
+                    # Baseline side exhausted: the remaining current steps are additions.
+                    changes.append(
+                        Change(path=f"trajectory[{cj}]", kind="added", after=_step_summary(c[cj]))
+                    )
+                else:
+                    # Current side exhausted: the remaining baseline steps are removals.
+                    changes.append(
+                        Change(
+                            path=f"trajectory[{bi}]",
+                            kind="removed",
+                            before=_step_summary(b[bi]),
+                        )
+                    )
+    _diff_guardrails(baseline, current, changes)
+    # Aggregate signals: emitted whenever the totals differ, in addition to per-step changes.
+    if baseline.turns != current.turns:
+        changes.append(
+            Change(
+                path="trajectory.turns", kind="changed", before=baseline.turns, after=current.turns
+            )
+        )
+    if baseline.step_count != current.step_count:
+        changes.append(
+            Change(
+                path="trajectory.steps",
+                kind="changed",
+                before=baseline.step_count,
+                after=current.step_count,
+            )
+        )
+def _diff_guardrails(baseline: Run, current: Run, changes: list[Change]) -> None:
+    """Emit a guardrail signal when a run's terminal condition changed (runaway or gave-up).
+    ``runaway`` takes precedence: a runaway run also lacks a ``FINAL`` step, so reporting both would
+    be redundant — the runaway signal already explains why the agent produced no answer.
+    """
+    if baseline.runaway != current.runaway:
+        changes.append(
+            Change(
+                path="trajectory.runaway",
+                kind="changed",
+                before=baseline.runaway,
+                after=current.runaway,
+            )
+        )
+    elif baseline.gave_up != current.gave_up:
+        changes.append(
+            Change(
+                path="trajectory.gave_up",
+                kind="changed",
+                before=baseline.gave_up,
+                after=current.gave_up,
+            )
+        )
+def _diff_observations(
+    b: list[Step],
+    c: list[Step],
+    i1: int,
+    i2: int,
+    j1: int,
+    judge: Judge,
+    changes: list[Change],
+) -> None:
+    """Diff the observations of aligned (equal-signature) ``TOOL`` steps through the judge.
+    Only ``TOOL`` steps carry observations worth comparing; a ``FINAL`` step's result is the output
+    (already judged), and a ``TURN`` has none. Opt-in via ``trajectory.observations``.
+    """
+    for offset in range(i2 - i1):
+        before, after = b[i1 + offset], c[j1 + offset]
+        if before.kind is StepKind.TOOL and not judge.equivalent(before.result, after.result):
+            changes.append(
+                Change(
+                    path=f"trajectory[{j1 + offset}].result",
+                    kind="changed",
+                    before=before.result,
+                    after=after.result,
+                )
+            )