PyPI - caliper-eval - Versions diffs - 0.1.0__py3-none-any.whl - Mend

caliper-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

caliper/__init__.py +0 -0
caliper/commands/__init__.py +0 -0
caliper/commands/install_skill.py +56 -0
caliper/commands/list_cmd.py +94 -0
caliper/commands/new.py +16 -0
caliper/commands/report.py +56 -0
caliper/commands/run.py +110 -0
caliper/commands/validate.py +66 -0
caliper/harness/__init__.py +31 -0
caliper/harness/base.py +50 -0
caliper/harness/claude_api.py +74 -0
caliper/harness/claude_code.py +372 -0
caliper/harness/codex.py +278 -0
caliper/harness/openai_api.py +61 -0
caliper/judge/__init__.py +42 -0
caliper/judge/autorater.py +129 -0
caliper/judge/base.py +28 -0
caliper/judge/claude_code_judge.py +92 -0
caliper/judge/codex_judge.py +180 -0
caliper/judge/openai_api_judge.py +89 -0
caliper/judge/script_assert.py +210 -0
caliper/main.py +30 -0
caliper/reporter.py +223 -0
caliper/resources/__init__.py +1 -0
caliper/resources/evaluate_skill/SKILL.md +252 -0
caliper/resources/evaluate_skill/__init__.py +1 -0
caliper/runner.py +308 -0
caliper/schema/__init__.py +0 -0
caliper/schema/results.py +75 -0
caliper/schema/spec.py +87 -0
caliper/scoring.py +33 -0
caliper/wizard.py +159 -0
caliper_eval-0.1.0.dist-info/METADATA +588 -0
caliper_eval-0.1.0.dist-info/RECORD +36 -0
caliper_eval-0.1.0.dist-info/WHEEL +4 -0
caliper_eval-0.1.0.dist-info/entry_points.txt +2 -0

caliper/__init__.py ADDED Viewed

File without changes

caliper/commands/__init__.py ADDED Viewed

File without changes

caliper/commands/install_skill.py ADDED Viewed

@@ -0,0 +1,56 @@
+from __future__ import annotations
+from importlib.resources import files
+from pathlib import Path
+from typing import Annotated
+import typer
+from click import ClickException
+# Package inside the distribution that ships the skill, and the resource name within it.
+SKILL_PACKAGE = "caliper.resources.evaluate_skill"
+SKILL_FILENAME = "SKILL.md"
+# Install location per supported agent; install_skill_cmd joins these onto Path.home().
+TARGETS = {
+    "codex": Path(".codex", "skills", "evaluate-skill", "SKILL.md"),
+    "claude-code": Path(".claude", "commands", "evaluate-skill.md"),
+}
+def load_packaged_skill() -> str:
+    """Return the UTF-8 text of the skill file bundled under SKILL_PACKAGE."""
+    package_root = files(SKILL_PACKAGE)
+    skill_resource = package_root.joinpath(SKILL_FILENAME)
+    return skill_resource.read_text(encoding="utf-8")
+# Install the packaged evaluate-skill file under the current user's home directory
+# for the chosen agent. (Kept as comments: Typer would by default surface a
+# docstring as the command's --help text.)
+#   target  - agent name; matched case-insensitively after stripping whitespace.
+#   force   - overwrite an existing file instead of aborting.
+#   dry_run - only report the destination; nothing is written.
+# Raises typer.BadParameter for an unknown target and ClickException when the
+# destination already exists and --force was not given.
+def install_skill_cmd(
+    target: Annotated[
+        str,
+        typer.Argument(help="Agent target to install for: codex or claude-code"),
+    ],
+    force: Annotated[
+        bool,
+        typer.Option("--force", help="Overwrite an existing installed skill"),
+    ] = False,
+    dry_run: Annotated[
+        bool,
+        typer.Option("--dry-run", help="Show the destination without writing files"),
+    ] = False,
+) -> None:
+    agent = target.strip().lower()
+    relative_path = TARGETS.get(agent)
+    if relative_path is None:
+        choices = ", ".join(sorted(TARGETS))
+        raise typer.BadParameter(f"unsupported target '{target}'. Choose one of: {choices}")
+    install_path = Path.home() / relative_path
+    # Loaded before the dry-run check, so a broken packaged resource also fails a dry run.
+    contents = load_packaged_skill()
+    if dry_run:
+        typer.echo(f"Would install evaluate-skill for {agent} to {install_path}")
+        return
+    already_installed = install_path.exists()
+    if already_installed and not force:
+        raise ClickException(
+            f"{install_path} already exists. Rerun with --force to overwrite it."
+        )
+    install_path.parent.mkdir(parents=True, exist_ok=True)
+    install_path.write_text(contents, encoding="utf-8")
+    typer.echo(f"Installed evaluate-skill for {agent} to {install_path}")

caliper/commands/list_cmd.py ADDED Viewed

@@ -0,0 +1,94 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from rich import box
+from rich.console import Console
+from rich.table import Table
+from caliper.schema.results import RunResults
+console = Console()
+# List stored evaluation results found under <directory>/.caliper/results.
+# With a spec name, show that spec's individual runs; without one, show a
+# per-spec summary. (Comments rather than a docstring: Typer would by default
+# surface a docstring as the command's --help text.)
+def list_cmd_fn(
+    spec: Annotated[Optional[str], typer.Argument(help="Spec name to list runs for")] = None,
+    directory: Annotated[Path, typer.Option("--dir", help="Directory to search")] = Path("."),
+) -> None:
+    results_root = directory / ".caliper" / "results"
+    if not spec:
+        _list_specs(results_root)
+        return
+    _list_runs(results_root / spec, spec)
+def _list_specs(results_dir: Path) -> None:
+    """Print a summary table with one row per spec directory in *results_dir*.
+
+    Each row shows the spec name, the number of ``*.json`` run files, and the
+    timestamp and pass@k of the lexicographically last file. If that file
+    cannot be parsed, its stem is shown as the timestamp and "?" as the score.
+    """
+    if not results_dir.exists():
+        console.print("[dim]No evaluation results found. Run [bold]caliper run[/bold] first.[/dim]")
+        return
+    summary = Table(box=box.ROUNDED, header_style="bold cyan", expand=False)
+    summary.add_column("Spec")
+    summary.add_column("Runs", justify="right")
+    summary.add_column("Latest run")
+    summary.add_column("pass@k", justify="right")
+    spec_dirs = (entry for entry in sorted(results_dir.iterdir()) if entry.is_dir())
+    for spec_dir in spec_dirs:
+        run_files = sorted(spec_dir.glob("*.json"))
+        if not run_files:
+            continue
+        newest = run_files[-1]
+        try:
+            parsed = RunResults.model_validate_json(newest.read_text())
+            latest_label = parsed.run.timestamp.strftime("%Y-%m-%d %H:%M")
+            score_label = f"{parsed.aggregate.avg_pass_at_k * 100:.1f}%"
+        except Exception:
+            # Best effort: an unreadable file still gets a row.
+            latest_label, score_label = newest.stem, "?"
+        summary.add_row(spec_dir.name, str(len(run_files)), latest_label, score_label)
+    if summary.row_count:
+        console.print(summary)
+    else:
+        console.print("[dim]No results yet.[/dim]")
+def _list_runs(spec_dir: Path, spec_name: str) -> None:
+    """Print a table with one row per ``*.json`` run file in *spec_dir*.
+
+    Exits with status 1 when *spec_dir* does not exist. A file that cannot be
+    parsed still gets a row: its stem stands in for the timestamp and every
+    other parsed column shows "?".
+    """
+    if not spec_dir.exists():
+        console.print(f"[bold red]Error:[/bold red] No results for spec {spec_name!r}")
+        raise typer.Exit(1)
+    run_files = sorted(spec_dir.glob("*.json"))
+    if not run_files:
+        console.print(f"[dim]No runs for {spec_name}.[/dim]")
+        return
+    runs_table = Table(box=box.ROUNDED, header_style="bold cyan", expand=False)
+    column_specs = (
+        ("Timestamp", {}),
+        ("k", {"justify": "right"}),
+        ("Tasks", {"justify": "right"}),
+        ("pass@k", {"justify": "right"}),
+        ("Judge", {}),
+        ("File", {"style": "dim"}),
+    )
+    for heading, options in column_specs:
+        runs_table.add_column(heading, **options)
+    for run_file in run_files:
+        try:
+            parsed = RunResults.model_validate_json(run_file.read_text())
+            # Cells in column order: timestamp, k, task count, pass@k, judge.
+            cells = (
+                parsed.run.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
+                str(parsed.run.k),
+                str(len(parsed.task_results)),
+                f"{parsed.aggregate.avg_pass_at_k * 100:.1f}%",
+                parsed.run.judge_strategy,
+            )
+        except Exception:
+            cells = (run_file.stem, "?", "?", "?", "?")
+        runs_table.add_row(*cells, run_file.name)
+    console.print(runs_table)

caliper/commands/new.py ADDED Viewed

@@ -0,0 +1,16 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from caliper.wizard import run_wizard
+# Start the interactive spec wizard, forwarding the CLI values as its initial
+# settings. (Comments rather than a docstring: Typer would by default surface a
+# docstring as the command's --help text.)
+def new_cmd(
+    name: Annotated[Optional[str], typer.Argument(help="Eval name (used as default filename)")] = None,
+    skill: Annotated[Optional[str], typer.Option("--skill", help="Pre-populate skill path")] = None,
+    backend: Annotated[str, typer.Option("--backend", help="Pre-populate backend")] = "claude-code",
+    output: Annotated[Optional[Path], typer.Option("--out", help="Output path for .eval.yaml")] = None,
+) -> None:
+    # The wizard calls the skill option `skill_path`; the other names map one-to-one.
+    wizard_options = {
+        "name": name,
+        "output": output,
+        "skill_path": skill,
+        "backend": backend,
+    }
+    run_wizard(**wizard_options)

caliper/commands/report.py ADDED Viewed

@@ -0,0 +1,56 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from rich.console import Console
+from caliper.reporter import print_results, results_to_json
+from caliper.schema.results import RunResults
+console = Console()
+# Render a stored results file, either as a table or as JSON.
+# (Comments rather than a docstring: Typer would by default surface a docstring
+# as the command's --help text.)
+#   spec_or_file - spec name, or a path to an existing results .json file.
+#   run          - run file stem to pick inside the spec's results directory.
+#   fmt          - "json" prints JSON; any other value prints the table.
+#   verbose      - forwarded to print_results.
+# Exits with status 1 when no results file is found or it cannot be parsed.
+def report_cmd(
+    spec_or_file: Annotated[str, typer.Argument(help="Spec name or path to results JSON")],
+    run: Annotated[Optional[str], typer.Option("--run", help="Specific run timestamp")] = None,
+    fmt: Annotated[str, typer.Option("--format", "-f", help="Output format: table | json")] = "table",
+    verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False,
+) -> None:
+    # Local import keeps this fix self-contained; rich is already a dependency of this module.
+    from rich.markup import escape
+    results_path = _resolve_path(spec_or_file, run)
+    if results_path is None:
+        console.print(f"[bold red]Error:[/bold red] No results found for {spec_or_file!r}")
+        raise typer.Exit(1)
+    try:
+        results = RunResults.model_validate_json(results_path.read_text())
+    except Exception as exc:
+        # The message is interpolated into a Rich markup string. Validation
+        # errors can contain square-bracketed text, which Rich would otherwise
+        # parse as style tags (dropping it, or raising on a stray closing tag).
+        console.print(f"[bold red]Error parsing results:[/bold red] {escape(str(exc))}")
+        raise typer.Exit(1) from exc
+    if fmt == "json":
+        console.print_json(results_to_json(results))
+    else:
+        print_results(results, verbose=verbose)
+def _resolve_path(spec_or_file: str, run: str | None) -> Path | None:
+    """Locate the results file that *spec_or_file* (and optionally *run*) refers to.
+
+    Resolution order:
+      1. *spec_or_file* itself, when it ends in ``.json`` and exists.
+      2. ``.caliper/results/<spec_or_file>/<run>.json`` when *run* is given.
+      3. The lexicographically last ``*.json`` in that directory.
+
+    Returns None when nothing matches. The directory is resolved relative to
+    the current working directory.
+    """
+    direct = Path(spec_or_file)
+    if direct.suffix == ".json" and direct.exists():
+        return direct
+    spec_results = Path(".caliper", "results", spec_or_file)
+    if not spec_results.exists():
+        return None
+    if run:
+        named = spec_results / f"{run}.json"
+        if named.exists():
+            return named
+        return None
+    # Run files are named so that lexicographic order is chronological order.
+    candidates = sorted(spec_results.glob("*.json"))
+    if not candidates:
+        return None
+    return candidates[-1]

caliper/commands/run.py ADDED Viewed

@@ -0,0 +1,110 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from caliper.harness.base import HarnessConfigurationError
+from caliper.harness import get_harness
+from caliper.judge import get_judge
+from caliper.reporter import (
+    make_progress,
+    print_banner,
+    print_results,
+    save_results,
+    update_progress,
+)
+from caliper.runner import TaskRunner
+from caliper.schema.spec import load_spec, spec_name
+console = Console()
+# Run every task of a spec through the configured harness and judge, show live
+# progress, save the results and print the report.
+# (Comments rather than a docstring: Typer would by default surface a docstring
+# as the command's --help text.)
+# Exit codes: 1 for a missing or invalid spec file, 2 when the runner raises
+# HarnessConfigurationError.
+def run_cmd(
+    spec_file: Path = typer.Argument(..., help="Path to .eval.yaml spec file"),
+    k: int = typer.Option(3, "--k", help="Attempts per task"),
+    workers: int = typer.Option(4, "--workers", help="Parallel task workers"),
+    timeout: int = typer.Option(120, "--timeout", help="Seconds per attempt"),
+    baseline: bool = typer.Option(False, "--baseline", help="Also run without skill for delta"),
+    judge_strategy: str = typer.Option("autorater", "--judge", help="Judge strategy: autorater | script"),
+    output: Optional[Path] = typer.Option(None, "--output", help="Save results JSON to path"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show per-attempt reasoning"),
+    model: Optional[str] = typer.Option(None, "--model", "-m", help="Override the skill model (e.g. claude-sonnet-4-6)"),
+) -> None:
+    # Local import keeps this fix self-contained; rich is already a dependency of this module.
+    from rich.markup import escape
+    if not spec_file.exists():
+        console.print(f"[bold red]Error:[/bold red] File not found: {spec_file}")
+        raise typer.Exit(1)
+    try:
+        spec = load_spec(spec_file)
+    except Exception as exc:
+        # The message is interpolated into a Rich markup string. Spec validation
+        # errors can contain square-bracketed text, which Rich would otherwise
+        # parse as style tags (dropping it, or raising on a stray closing tag).
+        console.print(f"[bold red]Invalid spec:[/bold red] {escape(str(exc))}")
+        raise typer.Exit(1) from exc
+    if model:
+        # The CLI override wins over the model named in the spec.
+        spec.skill.model = model
+    name = spec_name(spec_file)
+    print_banner(name, k, spec.skill.backend)
+    harness = get_harness(spec.skill.backend, spec.skill.model)
+    judge = get_judge(judge_strategy, spec.judge)
+    task_names = [t.name for t in spec.tasks]
+    progress, task_ids = make_progress(task_names, k)
+    attempt_counts: dict[str, int] = {t.name: 0 for t in spec.tasks}
+    pass_counts: dict[str, int] = {t.name: 0 for t in spec.tasks}
+    # One lookup table instead of a linear scan per callback. setdefault keeps
+    # the first task for a repeated id, matching a first-match search.
+    tasks_by_id: dict = {}
+    for t in spec.tasks:
+        tasks_by_id.setdefault(t.id, t)
+    def on_attempt_done(task_id: str, attempt: int, passed: bool, cheated: bool) -> None:
+        # Progress is keyed by task name, while the runner reports task ids.
+        task = tasks_by_id.get(task_id)
+        if task is None:
+            return
+        attempt_counts[task.name] += 1
+        if passed:
+            pass_counts[task.name] += 1
+        update_progress(
+            progress,
+            task_ids,
+            task.name,
+            k,
+            attempt_counts[task.name],
+            pass_counts[task.name],
+            cheated=cheated,
+        )
+    runner = TaskRunner(
+        harness=harness,
+        judge=judge,
+        spec=spec,
+        spec_path=spec_file,
+        k=k,
+        workers=workers,
+        timeout=timeout,
+        baseline=baseline,
+        judge_strategy=judge_strategy,
+        on_attempt_done=on_attempt_done,
+    )
+    with progress:
+        try:
+            results = runner.run()
+        except HarnessConfigurationError as exc:
+            console.print(
+                Panel(
+                    str(exc),
+                    title="[bold red]Backend configuration error[/bold red]",
+                    border_style="red",
+                )
+            )
+            raise typer.Exit(2) from exc
+    saved_path = save_results(results, str(spec_file))
+    if output:
+        # Optional extra copy at the user-chosen path, in addition to the saved file.
+        output.write_text(results.model_dump_json(indent=2))
+    print_results(results, verbose=verbose)
+    console.print(f"[dim]Results saved to {saved_path}[/dim]")

caliper/commands/validate.py ADDED Viewed

@@ -0,0 +1,66 @@
+from pathlib import Path
+import typer
+from pydantic import ValidationError
+from rich.console import Console
+from rich.panel import Panel
+from caliper.schema.spec import load_spec, spec_name
+console = Console()
+def _supports_unicode() -> bool:
+    """Return True when the console's output stream reports a UTF-family encoding.
+
+    A stream with no ``encoding`` attribute, or an empty one, counts as unsupported.
+    """
+    stream_encoding = getattr(console.file, "encoding", None)
+    if not stream_encoding:
+        return False
+    return "utf" in stream_encoding.lower()
+# Status glyphs, with ASCII fallbacks for streams that are not UTF-capable.
+CHECK, ARROW = ("✓", "→") if _supports_unicode() else ("OK", "->")
+# Load a spec file and report whether it is valid.
+# (Comments rather than a docstring: Typer would by default surface a docstring
+# as the command's --help text.)
+# Prints a green summary panel (name, backend, task count, judge) on success.
+# Exits with status 1 when the file is missing, fails schema validation, or
+# cannot be loaded for any other reason.
+def validate_cmd(
+    spec_file: Path = typer.Argument(..., help="Path to .eval.yaml spec file"),
+) -> None:
+    # Local import keeps this fix self-contained; rich is already a dependency of this module.
+    from rich.markup import escape
+    if not spec_file.exists():
+        console.print(f"[bold red]Error:[/bold red] File not found: {spec_file}")
+        raise typer.Exit(1)
+    try:
+        spec = load_spec(spec_file)
+    except ValidationError as exc:
+        console.print(
+            Panel(
+                _format_validation_errors(exc),
+                title="[bold red]Validation failed[/bold red]",
+                border_style="red",
+            )
+        )
+        raise typer.Exit(1) from exc
+    except Exception as exc:
+        # The message is interpolated into a Rich markup string. Bracketed text
+        # in a parser error would otherwise be read as style tags and dropped.
+        console.print(f"[bold red]Error parsing YAML:[/bold red] {escape(str(exc))}")
+        raise typer.Exit(1) from exc
+    name = spec_name(spec_file)
+    n_tasks = len(spec.tasks)
+    backend = spec.skill.backend
+    console.print(
+        Panel(
+            f"[bold]{name}[/bold]\n"
+            f"  backend  [cyan]{backend}[/cyan]\n"
+            f"  tasks    [cyan]{n_tasks}[/cyan]\n"
+            f"  judge    [cyan]{spec.judge.backend}[/cyan]"
+            # The judge model is only shown when the spec names one.
+            + (f" / [dim]{spec.judge.model}[/dim]" if spec.judge.model else ""),
+            title=f"[bold green]{CHECK} Spec is valid[/bold green]",
+            border_style="green",
+        )
+    )
+def _format_validation_errors(exc: ValidationError) -> str:
+    """Format each validation error as one Rich-markup line.
+
+    Every line shows the error location, with path parts joined by ARROW and
+    dimmed, followed by the error message. Location and message are escaped
+    so that any square brackets in them are printed literally instead of
+    being parsed as Rich style tags.
+
+    Returns the lines joined with newlines, or an empty string when there are
+    no errors.
+    """
+    # Local import keeps this fix self-contained; rich is already a dependency of this module.
+    from rich.markup import escape
+    separator = f" {ARROW} "
+    lines = []
+    for err in exc.errors():
+        location = escape(separator.join(str(part) for part in err["loc"]))
+        message = escape(str(err["msg"]))
+        lines.append(f"  [dim]{location}[/dim]  {message}")
+    return "\n".join(lines)

caliper/harness/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+from caliper.harness.base import AttemptResult, ConversationTurn, HarnessBackend
+from caliper.harness.claude_code import ClaudeCodeHarness
+from caliper.harness.claude_api import ClaudeAPIHarness
+from caliper.harness.openai_api import OpenAIAPIHarness
+from caliper.schema.spec import normalize_backend
+def get_harness(backend: str, model: str | None = None) -> HarnessBackend:
+    """Build the harness for *backend*, passing *model* to its constructor.
+
+    The backend name is first passed through ``normalize_backend``.
+
+    Raises:
+        ValueError: if the normalized name matches no known backend.
+    """
+    canonical = normalize_backend(backend)
+    if canonical == "claude-code":
+        return ClaudeCodeHarness(model=model)
+    if canonical == "codex":
+        # Imported on demand, so the codex module is only loaded for this backend.
+        from caliper.harness.codex import CodexHarness
+        return CodexHarness(model=model)
+    if canonical == "claude-api":
+        return ClaudeAPIHarness(model=model)
+    if canonical == "openai-api":
+        return OpenAIAPIHarness(model=model)
+    raise ValueError(f"Unknown backend: {backend!r}")
+# Public names of the harness package. CodexHarness is not listed here;
+# get_harness imports it on demand.
+__all__ = [
+    "AttemptResult",
+    "ConversationTurn",
+    "HarnessBackend",
+    "ClaudeCodeHarness",
+    "ClaudeAPIHarness",
+    "OpenAIAPIHarness",
+    "get_harness",
+]

caliper/harness/base.py ADDED Viewed

@@ -0,0 +1,50 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+class HarnessConfigurationError(RuntimeError):
+    """Raised when a harness cannot run because local configuration is invalid.
+
+    The ``run`` command catches this around the runner invocation, prints the
+    message in a red panel and exits with status 2.
+    """
+@dataclass
+class ConversationTurn:
+    """One entry in an attempt's transcript."""
+    # Speaker of the turn, e.g. "assistant" as recorded by ClaudeAPIHarness.
+    role: str
+    # Text of the turn.
+    content: str
+    # Optional tool-call details; all default to None.
+    # NOTE(review): presumably set only for turns that involve a tool call — confirm against the agent harnesses.
+    tool_name: str | None = None
+    tool_input: dict | None = None
+    tool_output: str | None = None
+@dataclass
+class AttemptResult:
+    """Outcome of a single harness attempt at a task."""
+    # Task identifier and attempt number, as passed to HarnessBackend.run.
+    task_id: str
+    attempt: int
+    # Recorded conversation; ClaudeAPIHarness leaves it empty when there is no output.
+    transcript: list[ConversationTurn]
+    # Final text produced by the backend.
+    final_output: str
+    # ClaudeAPIHarness reports 0 on success and 1 on failure.
+    exit_code: int
+    # Elapsed time of the attempt in seconds.
+    duration_seconds: float
+    # Error message when the attempt failed; None otherwise.
+    error: str | None = None
+    # NOTE(review): presumably set by cheat detection elsewhere in the package — confirm against the runner.
+    cheated: bool = False
+    # default_factory gives each instance its own list.
+    cheat_evidence: list[str] = field(default_factory=list)
+class HarnessBackend(ABC):
+    """Abstract interface implemented by every harness backend."""
+    # Execute one attempt of a task and return its AttemptResult.
+    #   task_id, attempt - identify the attempt; copied into the result.
+    #   prompt           - the task prompt.
+    #   skill_path       - optional path to a skill file; None means no skill.
+    #   model            - optional per-call model override.
+    #   timeout          - time limit in seconds.
+    #   isolated_home, extra_path - environment settings.
+    #     NOTE(review): presumably used by subprocess-based backends; ClaudeAPIHarness ignores both — confirm.
+    @abstractmethod
+    def run(
+        self,
+        task_id: str,
+        attempt: int,
+        prompt: str,
+        *,
+        skill_path: str | None,
+        model: str | None,
+        timeout: int,
+        isolated_home: str,
+        extra_path: list[str] | None = None,
+    ) -> AttemptResult: ...
+    # Backend identifier string, e.g. "claude-api".
+    @property
+    @abstractmethod
+    def name(self) -> str: ...

caliper/harness/claude_api.py ADDED Viewed

@@ -0,0 +1,74 @@
+from __future__ import annotations
+import re
+import time
+from pathlib import Path
+from caliper.harness.base import AttemptResult, ConversationTurn, HarnessBackend
+class ClaudeAPIHarness(HarnessBackend):
+    """Harness that answers a task with a single Anthropic Messages API call.
+
+    The skill, when given, is injected as plain text ahead of the prompt.
+    """
+    # Model used when neither run() nor the constructor supplies one.
+    DEFAULT_MODEL = "claude-haiku-4-5-20251001"
+    # Upper bound on generated tokens per request.
+    MAX_TOKENS = 4096
+    # Front-matter block ("---" ... "---") at the very start of a skill file.
+    _FRONT_MATTER = re.compile(r"^---\n.*?\n---\n", flags=re.DOTALL)
+    def __init__(self, model: str | None = None) -> None:
+        """Store *model* as the default for calls that pass no model."""
+        self._model = model
+    @property
+    def name(self) -> str:
+        """Return the backend identifier, ``"claude-api"``."""
+        return "claude-api"
+    def run(
+        self,
+        task_id: str,
+        attempt: int,
+        prompt: str,
+        *,
+        skill_path: str | None,
+        model: str | None,
+        timeout: int,
+        isolated_home: str,
+        extra_path: list[str] | None = None,
+    ) -> AttemptResult:
+        """Send *prompt* (with the skill injected, if any) and time the call.
+
+        Model precedence: the *model* argument, then the constructor's model,
+        then ``DEFAULT_MODEL``. ``isolated_home`` and ``extra_path`` are
+        accepted for interface compatibility and are not used here.
+
+        API failures do not raise: they produce a result with ``exit_code``
+        1 and the message in ``error``.
+        """
+        full_prompt = self._inject_skill(prompt, skill_path)
+        effective_model = model or self._model or self.DEFAULT_MODEL
+        start = time.monotonic()
+        output, exit_code, error = self._run_api(full_prompt, effective_model, timeout)
+        duration = time.monotonic() - start
+        # An empty reply yields an empty transcript rather than a blank turn.
+        transcript = [ConversationTurn(role="assistant", content=output)] if output else []
+        return AttemptResult(
+            task_id=task_id,
+            attempt=attempt,
+            transcript=transcript,
+            final_output=output,
+            exit_code=exit_code,
+            duration_seconds=duration,
+            error=error,
+        )
+    def _inject_skill(self, prompt: str, skill_path: str | None) -> str:
+        """Return *prompt* prefixed with the skill body, if the skill file exists.
+
+        The prompt is returned unchanged when *skill_path* is empty or does
+        not exist. Leading front matter is stripped from the skill text.
+        """
+        if not skill_path:
+            return prompt
+        skill_src = Path(skill_path).expanduser()
+        if not skill_src.exists():
+            return prompt
+        # Read as UTF-8 instead of the locale default, matching how the
+        # install-skill command writes SKILL.md. "utf-8-sig" also drops a
+        # leading BOM, which would otherwise stop the front-matter pattern
+        # from matching at the start of the text.
+        raw = skill_src.read_text(encoding="utf-8-sig")
+        body = self._FRONT_MATTER.sub("", raw).strip()
+        return f"[Skill context]\n{body}\n[End skill context]\n\n{prompt}"
+    def _run_api(self, prompt: str, model: str, timeout: int) -> tuple[str, int, str | None]:
+        """Call the Messages API once and return ``(text, exit_code, error)``.
+
+        Success gives ``(stripped_text, 0, None)``. A missing ``anthropic``
+        package or any exception from the call gives ``("", 1, message)``.
+        """
+        try:
+            # Imported here so a missing SDK becomes an error result, not an import failure.
+            import anthropic
+        except ImportError:
+            # Name the package that is actually missing; "caliper" is not this
+            # project's distribution name.
+            return "", 1, "anthropic package not installed; run: pip install anthropic"
+        try:
+            client = anthropic.Anthropic(timeout=timeout)
+            response = client.messages.create(
+                model=model,
+                max_tokens=self.MAX_TOKENS,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            # Keep only text blocks; other block types are ignored.
+            content = "".join(
+                block.text for block in response.content if getattr(block, "type", None) == "text"
+            )
+            return content.strip(), 0, None
+        except Exception as exc:
+            # Harness boundary: any failure is reported through the result.
+            return "", 1, str(exc)