caliper-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caliper/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ from importlib.resources import files
4
+ from pathlib import Path
5
+ from typing import Annotated
6
+
7
+ import typer
8
+ from click import ClickException
9
+
10
# Package that ships the bundled skill, and the name of the resource file inside it.
SKILL_PACKAGE = "caliper.resources.evaluate_skill"
SKILL_FILENAME = "SKILL.md"

# Relative install location of the skill file for each supported agent target.
TARGETS = {
    "codex": Path(".codex", "skills", "evaluate-skill", "SKILL.md"),
    "claude-code": Path(".claude", "commands", "evaluate-skill.md"),
}
17
+
18
+
19
def load_packaged_skill() -> str:
    """Return the text of the skill file bundled in ``SKILL_PACKAGE``, decoded as UTF-8."""
    package_root = files(SKILL_PACKAGE)
    skill_resource = package_root.joinpath(SKILL_FILENAME)
    return skill_resource.read_text(encoding="utf-8")
21
+
22
+
23
def install_skill_cmd(
    target: Annotated[
        str,
        typer.Argument(help="Agent target to install for: codex or claude-code"),
    ],
    force: Annotated[
        bool,
        typer.Option("--force", help="Overwrite an existing installed skill"),
    ] = False,
    dry_run: Annotated[
        bool,
        typer.Option("--dry-run", help="Show the destination without writing files"),
    ] = False,
) -> None:
    """Install the packaged evaluate-skill file under the user's home directory.

    Args:
        target: Agent name; surrounding whitespace and letter case are ignored.
        force: Overwrite the destination file when it already exists.
        dry_run: Only report where the file would be written.

    Raises:
        typer.BadParameter: If ``target`` is not a key of ``TARGETS``.
        ClickException: If the destination exists and ``force`` is not set.
    """
    normalized = target.strip().lower()
    relative_path = TARGETS.get(normalized)
    if relative_path is None:
        valid = ", ".join(sorted(TARGETS))
        raise typer.BadParameter(f"unsupported target '{target}'. Choose one of: {valid}")

    destination = Path.home() / relative_path
    # The packaged skill is loaded before the dry-run check, so a missing
    # resource surfaces even when nothing is written.
    skill_text = load_packaged_skill()

    if dry_run:
        typer.echo(f"Would install evaluate-skill for {normalized} to {destination}")
        return

    already_installed = destination.exists()
    if already_installed and not force:
        raise ClickException(
            f"{destination} already exists. Rerun with --force to overwrite it."
        )

    install_dir = destination.parent
    install_dir.mkdir(parents=True, exist_ok=True)
    destination.write_text(skill_text, encoding="utf-8")
    typer.echo(f"Installed evaluate-skill for {normalized} to {destination}")
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated, Optional
5
+
6
+ import typer
7
+ from rich import box
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from caliper.schema.results import RunResults
12
+
13
+ console = Console()
14
+
15
+
16
def list_cmd_fn(
    spec: Annotated[Optional[str], typer.Argument(help="Spec name to list runs for")] = None,
    directory: Annotated[Path, typer.Option("--dir", help="Directory to search")] = Path("."),
) -> None:
    """List stored evaluation results found under ``<directory>/.caliper/results``.

    With a spec name, the individual runs of that spec are listed; without
    one, a per-spec summary is shown instead.
    """
    results_root = directory / ".caliper" / "results"

    if not spec:
        _list_specs(results_root)
        return

    _list_runs(results_root / spec, spec)
26
+
27
+
28
def _list_specs(results_dir: Path) -> None:
    """Print one summary row per spec directory inside ``results_dir``.

    Each row shows the spec name, the number of ``*.json`` run files, and the
    timestamp and pass@k score read from the lexicographically last run file.
    """
    if not results_dir.exists():
        console.print("[dim]No evaluation results found. Run [bold]caliper run[/bold] first.[/dim]")
        return

    table = Table(box=box.ROUNDED, header_style="bold cyan", expand=False)
    column_specs = (
        ("Spec", {}),
        ("Runs", {"justify": "right"}),
        ("Latest run", {}),
        ("pass@k", {"justify": "right"}),
    )
    for heading, options in column_specs:
        table.add_column(heading, **options)

    for spec_dir in sorted(results_dir.iterdir()):
        if not spec_dir.is_dir():
            continue
        run_files = sorted(spec_dir.glob("*.json"))
        if not run_files:
            continue

        newest = run_files[-1]
        try:
            parsed = RunResults.model_validate_json(newest.read_text())
            ts = parsed.run.timestamp.strftime("%Y-%m-%d %H:%M")
            score = f"{parsed.aggregate.avg_pass_at_k * 100:.1f}%"
        except Exception:
            # Best effort: an unreadable result file still gets a row, labelled
            # with its file stem and an unknown score.
            ts, score = newest.stem, "?"

        table.add_row(spec_dir.name, str(len(run_files)), ts, score)

    if table.row_count:
        console.print(table)
    else:
        console.print("[dim]No results yet.[/dim]")
60
+
61
+
62
def _list_runs(spec_dir: Path, spec_name: str) -> None:
    """Print one row per ``*.json`` run file stored in ``spec_dir``.

    Raises:
        typer.Exit: With code 1 when ``spec_dir`` does not exist.
    """
    if not spec_dir.exists():
        console.print(f"[bold red]Error:[/bold red] No results for spec {spec_name!r}")
        raise typer.Exit(1)

    run_files = sorted(spec_dir.glob("*.json"))
    if not run_files:
        console.print(f"[dim]No runs for {spec_name}.[/dim]")
        return

    table = Table(box=box.ROUNDED, header_style="bold cyan", expand=False)
    column_specs = (
        ("Timestamp", {}),
        ("k", {"justify": "right"}),
        ("Tasks", {"justify": "right"}),
        ("pass@k", {"justify": "right"}),
        ("Judge", {}),
        ("File", {"style": "dim"}),
    )
    for heading, options in column_specs:
        table.add_column(heading, **options)

    for run_file in run_files:
        try:
            parsed = RunResults.model_validate_json(run_file.read_text())
            ts = parsed.run.timestamp.strftime("%Y-%m-%d %H:%M:%S")
            score = f"{parsed.aggregate.avg_pass_at_k * 100:.1f}%"
            k = str(parsed.run.k)
            n_tasks = str(len(parsed.task_results))
            judge = parsed.run.judge_strategy
            cells = (ts, k, n_tasks, score, judge)
        except Exception:
            # Best effort: an unreadable file is still listed, with its stem
            # standing in for the timestamp and "?" for every parsed value.
            cells = (run_file.stem, "?", "?", "?", "?")

        table.add_row(*cells, run_file.name)

    console.print(table)
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated, Optional
5
+
6
+ import typer
7
+
8
+ from caliper.wizard import run_wizard
9
+
10
def new_cmd(
    name: Annotated[Optional[str], typer.Argument(help="Eval name (used as default filename)")] = None,
    skill: Annotated[Optional[str], typer.Option("--skill", help="Pre-populate skill path")] = None,
    backend: Annotated[str, typer.Option("--backend", help="Pre-populate backend")] = "claude-code",
    output: Annotated[Optional[Path], typer.Option("--out", help="Output path for .eval.yaml")] = None,
) -> None:
    """Start the spec wizard, forwarding the command-line values as its initial settings."""
    wizard_settings = {
        "name": name,
        "output": output,
        "skill_path": skill,
        "backend": backend,
    }
    run_wizard(**wizard_settings)
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated, Optional
5
+
6
+ import typer
7
+ from rich.console import Console
8
+
9
+ from caliper.reporter import print_results, results_to_json
10
+ from caliper.schema.results import RunResults
11
+
12
+ console = Console()
13
+
14
+
15
def report_cmd(
    spec_or_file: Annotated[str, typer.Argument(help="Spec name or path to results JSON")],
    run: Annotated[Optional[str], typer.Option("--run", help="Specific run timestamp")] = None,
    fmt: Annotated[str, typer.Option("--format", "-f", help="Output format: table | json")] = "table",
    verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False,
) -> None:
    """Display a stored results file as JSON or as the standard results report.

    Any ``fmt`` value other than ``"json"`` selects the table report.

    Raises:
        typer.Exit: With code 1 when no results file can be located or when
            the file cannot be read and validated.
    """
    results_path = _resolve_path(spec_or_file, run)
    if results_path is None:
        console.print(f"[bold red]Error:[/bold red] No results found for {spec_or_file!r}")
        raise typer.Exit(1)

    try:
        raw_json = results_path.read_text()
        results = RunResults.model_validate_json(raw_json)
    except Exception as exc:
        console.print(f"[bold red]Error parsing results:[/bold red] {exc}")
        raise typer.Exit(1)

    if fmt != "json":
        print_results(results, verbose=verbose)
        return

    console.print_json(results_to_json(results))
36
+
37
+
38
+ def _resolve_path(spec_or_file: str, run: str | None) -> Path | None:
39
+ p = Path(spec_or_file)
40
+
41
+ # Direct JSON path
42
+ if p.suffix == ".json" and p.exists():
43
+ return p
44
+
45
+ # Spec name → look in .caliper/results/<name>/
46
+ results_dir = Path(".caliper") / "results" / spec_or_file
47
+ if not results_dir.exists():
48
+ return None
49
+
50
+ if run:
51
+ candidate = results_dir / f"{run}.json"
52
+ return candidate if candidate.exists() else None
53
+
54
+ # Latest = lexicographically last (ISO timestamps sort correctly)
55
+ files = sorted(results_dir.glob("*.json"))
56
+ return files[-1] if files else None
@@ -0,0 +1,110 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ import typer
7
+ from rich.console import Console
8
+ from rich.panel import Panel
9
+
10
+ from caliper.harness.base import HarnessConfigurationError
11
+ from caliper.harness import get_harness
12
+ from caliper.judge import get_judge
13
+ from caliper.reporter import (
14
+ make_progress,
15
+ print_banner,
16
+ print_results,
17
+ save_results,
18
+ update_progress,
19
+ )
20
+ from caliper.runner import TaskRunner
21
+ from caliper.schema.spec import load_spec, spec_name
22
+
23
+ console = Console()
24
+
25
+
26
def run_cmd(
    spec_file: Path = typer.Argument(..., help="Path to .eval.yaml spec file"),
    k: int = typer.Option(3, "--k", help="Attempts per task"),
    workers: int = typer.Option(4, "--workers", help="Parallel task workers"),
    timeout: int = typer.Option(120, "--timeout", help="Seconds per attempt"),
    baseline: bool = typer.Option(False, "--baseline", help="Also run without skill for delta"),
    judge_strategy: str = typer.Option("autorater", "--judge", help="Judge strategy: autorater | script"),
    output: Optional[Path] = typer.Option(None, "--output", help="Save results JSON to path"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show per-attempt reasoning"),
    model: Optional[str] = typer.Option(None, "--model", "-m", help="Override the skill model (e.g. claude-sonnet-4-6)"),
) -> None:
    """Run every task of an eval spec, then save and print the results.

    Raises:
        typer.Exit: Code 1 when the spec file is missing or fails to load;
            code 2 when the runner reports a ``HarnessConfigurationError``.
    """
    if not spec_file.exists():
        console.print(f"[bold red]Error:[/bold red] File not found: {spec_file}")
        raise typer.Exit(1)

    try:
        spec = load_spec(spec_file)
    except Exception as exc:
        console.print(f"[bold red]Invalid spec:[/bold red] {exc}")
        raise typer.Exit(1)

    # A --model value replaces the model declared in the spec.
    if model:
        spec.skill.model = model

    name = spec_name(spec_file)
    print_banner(name, k, spec.skill.backend)

    harness = get_harness(spec.skill.backend, spec.skill.model)
    judge = get_judge(judge_strategy, spec.judge)

    task_names = [task.name for task in spec.tasks]
    progress, task_ids = make_progress(task_names, k)

    # Per-task tallies, keyed by task name, that feed the progress display.
    attempt_counts: dict[str, int] = dict.fromkeys(task_names, 0)
    pass_counts: dict[str, int] = dict.fromkeys(task_names, 0)

    def on_attempt_done(task_id: str, attempt: int, passed: bool, cheated: bool) -> None:
        # The runner reports task ids while the tallies are keyed by name, so
        # find the first task with a matching id; unknown ids are ignored.
        for candidate in spec.tasks:
            if candidate.id == task_id:
                break
        else:
            return

        label = candidate.name
        attempt_counts[label] += 1
        if passed:
            pass_counts[label] += 1
        update_progress(
            progress,
            task_ids,
            label,
            k,
            attempt_counts[label],
            pass_counts[label],
            cheated=cheated,
        )

    runner = TaskRunner(
        harness=harness,
        judge=judge,
        spec=spec,
        spec_path=spec_file,
        k=k,
        workers=workers,
        timeout=timeout,
        baseline=baseline,
        judge_strategy=judge_strategy,
        on_attempt_done=on_attempt_done,
    )

    with progress:
        try:
            results = runner.run()
        except HarnessConfigurationError as exc:
            error_panel = Panel(
                str(exc),
                title="[bold red]Backend configuration error[/bold red]",
                border_style="red",
            )
            console.print(error_panel)
            raise typer.Exit(2)

    saved_path = save_results(results, str(spec_file))
    if output:
        # Optional extra copy at the location requested with --output.
        serialized = results.model_dump_json(indent=2)
        Path(output).write_text(serialized)

    print_results(results, verbose=verbose)
    console.print(f"[dim]Results saved to {saved_path}[/dim]")
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+
3
+ import typer
4
+ from pydantic import ValidationError
5
+ from rich.console import Console
6
+ from rich.panel import Panel
7
+
8
+ from caliper.schema.spec import load_spec, spec_name
9
+
10
+ console = Console()
11
+
12
+
13
def _supports_unicode() -> bool:
    """Report whether the console's output stream declares a UTF-based encoding."""
    declared = getattr(console.file, "encoding", None)
    if not declared:
        # No encoding attribute, or an empty/None one: assume no Unicode support.
        return False
    return "utf" in declared.lower()
16
+
17
+
18
# Status glyphs, with plain-ASCII fallbacks when the console is not UTF-based.
CHECK, ARROW = ("✓", "→") if _supports_unicode() else ("OK", "->")
20
+
21
+
22
def validate_cmd(
    spec_file: Path = typer.Argument(..., help="Path to .eval.yaml spec file"),
) -> None:
    """Load an eval spec and print either a summary panel or its validation errors.

    Raises:
        typer.Exit: With code 1 when the file is missing, fails validation, or
            raises any other error while loading.
    """
    if not spec_file.exists():
        console.print(f"[bold red]Error:[/bold red] File not found: {spec_file}")
        raise typer.Exit(1)

    try:
        spec = load_spec(spec_file)
    except ValidationError as exc:
        failure_panel = Panel(
            _format_validation_errors(exc),
            title="[bold red]Validation failed[/bold red]",
            border_style="red",
        )
        console.print(failure_panel)
        raise typer.Exit(1)
    except Exception as exc:
        console.print(f"[bold red]Error parsing YAML:[/bold red] {exc}")
        raise typer.Exit(1)

    name = spec_name(spec_file)
    n_tasks = len(spec.tasks)
    backend = spec.skill.backend

    summary = (
        f"[bold]{name}[/bold]\n"
        f" backend [cyan]{backend}[/cyan]\n"
        f" tasks [cyan]{n_tasks}[/cyan]\n"
        f" judge [cyan]{spec.judge.backend}[/cyan]"
    )
    # The judge model is appended only when the spec sets one.
    if spec.judge.model:
        summary += f" / [dim]{spec.judge.model}[/dim]"

    success_panel = Panel(
        summary,
        title=f"[bold green]{CHECK} Spec is valid[/bold green]",
        border_style="green",
    )
    console.print(success_panel)
59
+
60
+
61
def _format_validation_errors(exc: ValidationError) -> str:
    """Render each validation error as a ``location message`` line, joined by newlines.

    The parts of an error's location are stringified and joined with ``ARROW``.
    """
    separator = f" {ARROW} "
    return "\n".join(
        f" [dim]{separator.join(map(str, err['loc']))}[/dim] {err['msg']}"
        for err in exc.errors()
    )
@@ -0,0 +1,31 @@
1
+ from caliper.harness.base import AttemptResult, ConversationTurn, HarnessBackend
2
+ from caliper.harness.claude_code import ClaudeCodeHarness
3
+ from caliper.harness.claude_api import ClaudeAPIHarness
4
+ from caliper.harness.openai_api import OpenAIAPIHarness
5
+ from caliper.schema.spec import normalize_backend
6
+
7
+
8
def get_harness(backend: str, model: str | None = None) -> HarnessBackend:
    """Build the harness that corresponds to ``backend``.

    Args:
        backend: Backend name; it is passed through ``normalize_backend``
            before being matched.
        model: Optional model identifier handed to the harness constructor.

    Raises:
        ValueError: If the normalized name matches no known backend.
    """
    canonical = normalize_backend(backend)

    if canonical == "claude-code":
        return ClaudeCodeHarness(model=model)
    if canonical == "codex":
        # Imported only when requested, unlike the other harness modules.
        from caliper.harness.codex import CodexHarness

        return CodexHarness(model=model)
    if canonical == "claude-api":
        return ClaudeAPIHarness(model=model)
    if canonical == "openai-api":
        return OpenAIAPIHarness(model=model)

    raise ValueError(f"Unknown backend: {backend!r}")
21
+
22
+
23
+ __all__ = [
24
+ "AttemptResult",
25
+ "ConversationTurn",
26
+ "HarnessBackend",
27
+ "ClaudeCodeHarness",
28
+ "ClaudeAPIHarness",
29
+ "OpenAIAPIHarness",
30
+ "get_harness",
31
+ ]
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+
6
+
7
class HarnessConfigurationError(RuntimeError):
    """Signal that a harness cannot run because the local configuration is invalid."""
9
+
10
+
11
@dataclass
class ConversationTurn:
    """A single entry in an attempt's transcript."""

    # Who produced the turn, and its text.
    role: str
    content: str

    # Tool-call details; all default to None.
    # NOTE(review): presumably set only on tool-use turns; confirm with producers.
    tool_name: str | None = None
    tool_input: dict | None = None
    tool_output: str | None = None
18
+
19
+
20
@dataclass
class AttemptResult:
    """Outcome of one attempt at one task, as returned by a harness backend."""

    # Which task and which attempt this result belongs to.
    task_id: str
    attempt: int

    # Recorded conversation turns and the final text produced.
    transcript: list[ConversationTurn]
    final_output: str

    # Process-style status and elapsed time of the attempt.
    # NOTE(review): 0 presumably means success; confirm per backend.
    exit_code: int
    duration_seconds: float

    # Error description, if any.
    error: str | None = None

    # Cheating flag and its supporting evidence; each instance gets its own list.
    cheated: bool = False
    cheat_evidence: list[str] = field(default_factory=list)
31
+
32
+
33
class HarnessBackend(ABC):
    """Abstract interface that every harness backend implements."""

    @abstractmethod
    def run(
        self,
        task_id: str,
        attempt: int,
        prompt: str,
        *,
        skill_path: str | None,
        model: str | None,
        timeout: int,
        isolated_home: str,
        extra_path: list[str] | None = None,
    ) -> AttemptResult:
        """Execute one attempt of a task and return its ``AttemptResult``.

        Everything after ``prompt`` must be passed by keyword.
        """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the backend's name."""
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import time
5
+ from pathlib import Path
6
+
7
+ from caliper.harness.base import AttemptResult, ConversationTurn, HarnessBackend
8
+
9
+
10
class ClaudeAPIHarness(HarnessBackend):
    """Harness backend that sends each prompt to the Anthropic Messages API."""

    def __init__(self, model: str | None = None) -> None:
        """Remember the model to use when ``run`` is not given one."""
        self._model = model

    @property
    def name(self) -> str:
        """Return the backend name, ``"claude-api"``."""
        return "claude-api"

    def run(
        self,
        task_id: str,
        attempt: int,
        prompt: str,
        *,
        skill_path: str | None,
        model: str | None,
        timeout: int,
        isolated_home: str,
        extra_path: list[str] | None = None,
    ) -> AttemptResult:
        """Run one attempt through the API and package the outcome.

        ``isolated_home`` and ``extra_path`` are accepted for interface
        compatibility and are not used by this backend.

        Returns:
            An ``AttemptResult`` whose transcript holds a single assistant
            turn when the API returned text, and is empty otherwise.
        """
        full_prompt = self._inject_skill(prompt, skill_path)
        # Precedence: per-call model, then constructor model, then the default.
        effective_model = model or self._model or "claude-haiku-4-5-20251001"
        start = time.monotonic()
        output, exit_code, error = self._run_api(full_prompt, effective_model, timeout)
        duration = time.monotonic() - start
        transcript = [ConversationTurn(role="assistant", content=output)] if output else []
        return AttemptResult(
            task_id=task_id,
            attempt=attempt,
            transcript=transcript,
            final_output=output,
            exit_code=exit_code,
            duration_seconds=duration,
            error=error,
        )

    def _inject_skill(self, prompt: str, skill_path: str | None) -> str:
        """Prefix ``prompt`` with the body of the skill file, when one is usable.

        The prompt is returned unchanged when ``skill_path`` is empty or does
        not point at a regular file. A leading block delimited by ``---``
        lines is stripped from the skill text before it is wrapped.
        """
        if not skill_path:
            return prompt
        skill_src = Path(skill_path).expanduser()
        # is_file() rather than exists(): a directory would otherwise pass the
        # check and make read_text() raise.
        if not skill_src.is_file():
            return prompt
        # Skill files are read as UTF-8 instead of the locale encoding, which
        # differs between platforms.
        raw = skill_src.read_text(encoding="utf-8")
        body = re.sub(r"^---\n.*?\n---\n", "", raw, flags=re.DOTALL).strip()
        return f"[Skill context]\n{body}\n[End skill context]\n\n{prompt}"

    def _run_api(self, prompt: str, model: str, timeout: int) -> tuple[str, int, str | None]:
        """Send ``prompt`` as a single user message and collect the text reply.

        Returns:
            ``(text, 0, None)`` on success, or ``("", 1, message)`` when the
            ``anthropic`` package is missing or the request raises.
        """
        try:
            import anthropic
        except ImportError:
            # Name the module that is actually missing in the install hint.
            return "", 1, "anthropic package not installed; run: pip install anthropic"

        try:
            client = anthropic.Anthropic(timeout=timeout)
            response = client.messages.create(
                model=model,
                max_tokens=4096,
                messages=[{"role": "user", "content": prompt}],
            )
            # Keep only text blocks; other block types are ignored.
            content = "".join(
                block.text for block in response.content if getattr(block, "type", None) == "text"
            )
            return content.strip(), 0, None
        except Exception as exc:
            # Deliberate best effort: any failure is reported through the
            # result tuple instead of propagating.
            return "", 1, str(exc)