loopengt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. loopengt/__init__.py +31 -0
  2. loopengt/adapters/__init__.py +1 -0
  3. loopengt/adapters/antigravity/__init__.py +1 -0
  4. loopengt/adapters/antigravity/adapter.py +55 -0
  5. loopengt/adapters/antigravity/commands.py +21 -0
  6. loopengt/adapters/base.py +51 -0
  7. loopengt/adapters/claude_code/__init__.py +1 -0
  8. loopengt/adapters/claude_code/adapter.py +55 -0
  9. loopengt/adapters/claude_code/commands.py +16 -0
  10. loopengt/adapters/codex/__init__.py +1 -0
  11. loopengt/adapters/codex/adapter.py +52 -0
  12. loopengt/adapters/codex/commands.py +16 -0
  13. loopengt/adapters/cursor/__init__.py +1 -0
  14. loopengt/adapters/cursor/adapter.py +56 -0
  15. loopengt/adapters/cursor/commands.py +29 -0
  16. loopengt/adapters/generic/__init__.py +1 -0
  17. loopengt/adapters/generic/terminal.py +82 -0
  18. loopengt/cli/__init__.py +1 -0
  19. loopengt/cli/commands/__init__.py +1 -0
  20. loopengt/cli/commands/design.py +171 -0
  21. loopengt/cli/commands/doctor.py +110 -0
  22. loopengt/cli/commands/eval.py +105 -0
  23. loopengt/cli/commands/init.py +131 -0
  24. loopengt/cli/commands/mcp_serve.py +57 -0
  25. loopengt/cli/commands/run.py +99 -0
  26. loopengt/cli/commands/template.py +145 -0
  27. loopengt/cli/commands/trace.py +114 -0
  28. loopengt/cli/formatters.py +125 -0
  29. loopengt/cli/main.py +66 -0
  30. loopengt/core/__init__.py +1 -0
  31. loopengt/core/evals/__init__.py +1 -0
  32. loopengt/core/evals/judges.py +216 -0
  33. loopengt/core/evals/metrics.py +119 -0
  34. loopengt/core/evals/regression.py +157 -0
  35. loopengt/core/memory/__init__.py +1 -0
  36. loopengt/core/memory/retrieval.py +124 -0
  37. loopengt/core/memory/store.py +184 -0
  38. loopengt/core/memory/summarizer.py +97 -0
  39. loopengt/core/models/__init__.py +43 -0
  40. loopengt/core/models/agent.py +126 -0
  41. loopengt/core/models/loop_spec.py +251 -0
  42. loopengt/core/models/policy.py +131 -0
  43. loopengt/core/models/state.py +271 -0
  44. loopengt/core/models/tool.py +105 -0
  45. loopengt/core/runtime/__init__.py +1 -0
  46. loopengt/core/runtime/checkpoint.py +152 -0
  47. loopengt/core/runtime/executor.py +463 -0
  48. loopengt/core/runtime/handoff.py +139 -0
  49. loopengt/core/runtime/scheduler.py +168 -0
  50. loopengt/core/tracing/__init__.py +1 -0
  51. loopengt/core/tracing/events.py +95 -0
  52. loopengt/core/tracing/exporters.py +158 -0
  53. loopengt/core/tracing/store.py +202 -0
  54. loopengt/mcp/__init__.py +1 -0
  55. loopengt/mcp/client/__init__.py +1 -0
  56. loopengt/mcp/client/manager.py +118 -0
  57. loopengt/mcp/client/tools.py +107 -0
  58. loopengt/mcp/server/__init__.py +1 -0
  59. loopengt/mcp/server/prompts.py +82 -0
  60. loopengt/mcp/server/resources.py +75 -0
  61. loopengt/mcp/server/server.py +50 -0
  62. loopengt/mcp/server/tools.py +214 -0
  63. loopengt/mcp/shared/__init__.py +1 -0
  64. loopengt/mcp/shared/schemas.py +91 -0
  65. loopengt/plugins/__init__.py +1 -0
  66. loopengt/plugins/base.py +90 -0
  67. loopengt/plugins/loader.py +130 -0
  68. loopengt/plugins/manifest.py +70 -0
  69. loopengt/plugins/registry.py +146 -0
  70. loopengt/prompts/LOOPENGT.md +60 -0
  71. loopengt/prompts/__init__.py +1 -0
  72. loopengt/storage/__init__.py +1 -0
  73. loopengt/storage/jsonl.py +84 -0
  74. loopengt/storage/sqlite.py +102 -0
  75. loopengt/templates/__init__.py +1 -0
  76. loopengt/templates/builtins/handoff_loop/LOOPENGS.md +10 -0
  77. loopengt/templates/builtins/planner_executor/LOOPENGS.md +29 -0
  78. loopengt/templates/builtins/research_architect/LOOPENGS.md +17 -0
  79. loopengt/templates/builtins/reviewer_retry/LOOPENGS.md +29 -0
  80. loopengt/templates/builtins/supervisor_workers/LOOPENGS.md +29 -0
  81. loopengt/templates/loader.py +38 -0
  82. loopengt/templates/registry.py +85 -0
  83. loopengt-0.1.0.dist-info/METADATA +275 -0
  84. loopengt-0.1.0.dist-info/RECORD +87 -0
  85. loopengt-0.1.0.dist-info/WHEEL +4 -0
  86. loopengt-0.1.0.dist-info/entry_points.txt +8 -0
  87. loopengt-0.1.0.dist-info/licenses/LICENSE +674 -0
@@ -0,0 +1,114 @@
1
+ """``loopengt trace`` — inspect the execution trace of a run."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import typer
9
+
10
+ from loopengt.cli.formatters import (
11
+ console,
12
+ print_error,
13
+ print_info,
14
+ print_run_trace_tree,
15
+ )
16
+
17
+
18
def trace_cmd(
    run_id: str = typer.Argument(..., help="Run ID to inspect."),
    runs_dir: Path = typer.Option(
        Path(".loopengt/runs"),
        "--runs-dir",
        help="Directory containing run traces.",
    ),
    output_format: str = typer.Option(
        "tree",
        "--format",
        "-f",
        help="Output format: tree, json, markdown.",
    ),
) -> None:
    """Show step-by-step execution trace for a completed run."""
    # Resolution order: exact ``<run_id>.json`` first, then the
    # alphabetically first ``*.json`` file whose name starts with ``run_id``.
    # Exits with code 1 when nothing matches, when the file cannot be read or
    # parsed, or when a tree/markdown rendering is requested for a trace that
    # is not a JSON object.
    trace_path = runs_dir / f"{run_id}.json"

    if not trace_path.is_file():
        # Prefix fallback. Filtering by name (instead of globbing on the raw
        # run_id) keeps glob metacharacters in the ID literal, ignores
        # directories and non-JSON files, and sorting makes the pick stable.
        candidates = []
        if runs_dir.is_dir():
            candidates = sorted(
                path
                for path in runs_dir.glob("*.json")
                if path.is_file() and path.name.startswith(run_id)
            )

        if candidates:
            trace_path = candidates[0]
        else:
            print_error(f"Trace not found: {trace_path}")
            print_info(f"Available runs in {runs_dir}:")
            if runs_dir.exists():
                for f in sorted(runs_dir.glob("*.json")):
                    print_info(f" {f.stem}")
            raise typer.Exit(code=1)

    try:
        trace_data = json.loads(trace_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError) as exc:
        print_error(f"Failed to read trace: {exc}")
        raise typer.Exit(code=1) from exc

    if output_format == "json":
        # Raw dump: any valid JSON value can be echoed back as-is.
        console.print_json(json.dumps(trace_data, indent=2, default=str))
        return

    # The structured renderers call ``.get`` on the trace, so anything other
    # than a JSON object would fail with an AttributeError further down.
    if not isinstance(trace_data, dict):
        print_error(
            f"Failed to read trace: expected a JSON object in {trace_path}"
        )
        raise typer.Exit(code=1)

    if output_format == "markdown":
        _print_markdown_trace(trace_data)
    else:
        # Unknown formats fall back to the tree view, as before.
        _print_tree_trace(trace_data)
60
+
61
+
62
def _print_tree_trace(trace_data: dict) -> None:
    """Render trace as a Rich tree.

    Reshapes ``trace_data["step_states"]`` (a mapping of step name to step
    state) into the flat ``steps`` list that ``print_run_trace_tree`` reads.
    Missing, ``null`` or malformed entries are replaced by defaults so a
    partially written trace still renders.
    """
    step_states = trace_data.get("step_states") or {}
    if not isinstance(step_states, dict):
        step_states = {}

    steps = []
    for step_name, step_state in step_states.items():
        if not isinstance(step_state, dict):
            step_state = {}
        result = step_state.get("result") or {}
        if not isinstance(result, dict):
            result = {}
        steps.append(
            {
                "step_name": step_name,
                "status": step_state.get("status", "unknown"),
                # ``or 0`` also covers an explicit null, which ``.get`` with a
                # default would pass through and break the ``.2f`` formatting.
                "duration_seconds": result.get("duration_seconds") or 0,
                "error": result.get("error"),
            }
        )

    print_run_trace_tree(
        {
            "run_id": trace_data.get("run_id", "unknown"),
            "status": trace_data.get("status", "unknown"),
            "steps": steps,
        }
    )
85
+
86
+
87
def _print_markdown_trace(trace_data: dict) -> None:
    """Render trace as markdown text.

    Emits a header with the run ID, status, loop name and turn count,
    followed by one ``###`` section per entry in ``step_states`` listing its
    status, attempts and, when present, error and duration.
    """
    lines = [
        f"# Run Trace: {trace_data.get('run_id', 'unknown')}",
        "",
        f"**Status**: {trace_data.get('status', 'unknown')}",
        f"**Loop**: {trace_data.get('loop_name', 'unknown')}",
        f"**Turns**: {trace_data.get('turn', 0)}",
        "",
        "## Steps",
        "",
    ]

    step_states = trace_data.get("step_states") or {}
    if not isinstance(step_states, dict):
        step_states = {}

    for step_name, step_state in step_states.items():
        if not isinstance(step_state, dict):
            step_state = {}
        lines.append(f"### {step_name}")
        lines.append(f"- Status: {step_state.get('status', 'unknown')}")
        lines.append(f"- Attempts: {step_state.get('attempts', 0)}")

        result = step_state.get("result")
        if isinstance(result, dict):
            if result.get("error"):
                lines.append(f"- Error: {result['error']}")
            duration = result.get("duration_seconds")
            if duration:
                if isinstance(duration, (int, float)):
                    lines.append(f"- Duration: {duration:.2f}s")
                else:
                    # Non-numeric value: show it verbatim rather than crash.
                    lines.append(f"- Duration: {duration}s")
        lines.append("")

    # The text comes from the trace file, so square brackets in it must not
    # be interpreted as Rich console markup.
    console.print("\n".join(lines), markup=False)
@@ -0,0 +1,125 @@
1
+ """Rich formatters for CLI output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from rich.console import Console
8
+ from rich.panel import Panel
9
+ from rich.table import Table
10
+ from rich.text import Text
11
+ from rich.tree import Tree
12
+
13
+ console = Console()
14
+ error_console = Console(stderr=True)
15
+
16
+
17
def print_success(message: str) -> None:
    """Write *message* to the standard console behind a bold green check mark."""
    line = f"[bold green]✓[/bold green] {message}"
    console.print(line)
20
+
21
+
22
def print_error(message: str) -> None:
    """Write *message* to the stderr console behind a bold red cross."""
    line = f"[bold red]✗[/bold red] {message}"
    error_console.print(line)
25
+
26
+
27
def print_warning(message: str) -> None:
    """Write *message* to the standard console behind a bold yellow warning sign."""
    line = f"[bold yellow]⚠[/bold yellow] {message}"
    console.print(line)
30
+
31
+
32
def print_info(message: str) -> None:
    """Write *message* to the standard console behind a bold blue arrow."""
    line = f"[bold blue]→[/bold blue] {message}"
    console.print(line)
35
+
36
+
37
def print_header(title: str, subtitle: str = "") -> None:
    """Print *title* in bold inside a blue-bordered panel.

    A non-empty *subtitle* is added on a second, dimmed line.
    """
    rows = [f"[bold]{title}[/bold]"]
    if subtitle:
        rows.append(f"[dim]{subtitle}[/dim]")
    panel = Panel("\n".join(rows), border_style="blue", padding=(1, 2))
    console.print(panel)
43
+
44
+
45
def print_loop_spec_table(spec: dict[str, Any]) -> None:
    """Print a two-column summary table for a loop spec.

    Scalar fields (``name``, ``version``, ``goal``, ``pattern``) are shown
    when present. ``agents`` and ``steps`` are shown as comma-separated
    names; dict entries contribute their ``"name"`` value, anything else its
    ``str()`` form.
    """

    def _label(entry: Any) -> str:
        # Dict entries are expected to carry a "name" key.
        if isinstance(entry, dict):
            return entry["name"]
        return str(entry)

    summary = Table(title="Loop Specification", show_header=True, border_style="blue")
    summary.add_column("Field", style="cyan", width=20)
    summary.add_column("Value", style="white")

    for field in ("name", "version", "goal", "pattern"):
        if field in spec:
            summary.add_row(field, str(spec[field]))

    for collection in ("agents", "steps"):
        if collection in spec:
            names = [_label(entry) for entry in spec[collection]]
            summary.add_row(collection, ", ".join(names))

    console.print(summary)
70
+
71
+
72
def print_run_trace_tree(trace: dict[str, Any]) -> None:
    """Print an execution trace as a Rich tree.

    ``trace`` supplies ``run_id``, ``status`` and a ``steps`` list whose
    items carry ``step_name``, ``status``, ``duration_seconds`` and an
    optional ``error``. The root node shows the run; each step becomes a
    child node, with the error (if any) nested beneath it.
    """
    run_colors = {
        "completed": "green",
        "failed": "red",
        "running": "yellow",
        "pending": "dim",
        "cancelled": "red",
    }
    step_colors = {
        "completed": "green",
        "failed": "red",
        "running": "yellow",
        "skipped": "dim",
    }

    def _seconds(raw: Any) -> float:
        # Traces may store null or a non-numeric duration; show 0.00s then
        # instead of failing inside the ``.2f`` format spec.
        try:
            return float(raw or 0)
        except (TypeError, ValueError):
            return 0.0

    run_id = trace.get("run_id", "unknown")
    status = str(trace.get("status", "unknown"))

    # Labels are assembled from styled fragments rather than a markup string
    # so that brackets inside trace-provided values are shown literally.
    tree = Tree(
        Text.assemble(
            ("Run:", "bold"),
            f" {run_id} ",
            (status, run_colors.get(status, "white")),
        )
    )

    for step in trace.get("steps") or []:
        step_name = step.get("step_name", "unknown")
        step_status = str(step.get("status", "unknown"))
        duration = _seconds(step.get("duration_seconds", 0))

        step_node = tree.add(
            Text.assemble(
                ("●", step_colors.get(step_status, "white")),
                f" {step_name} ",
                (f"{duration:.2f}s", "dim"),
            )
        )

        if step.get("error"):
            step_node.add(Text(f"Error: {step['error']}", style="red"))

    console.print(tree)
115
+
116
+
117
def print_doctor_result(
    name: str, passed: bool, detail: str = ""
) -> None:
    """Print one diagnostic check as an icon, the check name and optional detail.

    A green check mark is used when *passed* is true, a red cross otherwise;
    a non-empty *detail* is appended in dim text.
    """
    if passed:
        icon = "[bold green]✓[/bold green]"
    else:
        icon = "[bold red]✗[/bold red]"

    suffix = f" [dim]{detail}[/dim]" if detail else ""
    console.print(f"{icon} {name}" + suffix)
loopengt/cli/main.py ADDED
@@ -0,0 +1,66 @@
1
+ """loopengt CLI — the main Typer application entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import typer
6
+
7
+ from loopengt import __version__
8
+
9
+ app = typer.Typer(
10
+ name="loopengt",
11
+ help="Loop Engineering Agent — design, orchestrate, and evaluate agent loops.",
12
+ no_args_is_help=True,
13
+ rich_markup_mode="rich",
14
+ add_completion=True,
15
+ )
16
+
17
+
18
def version_callback(value: bool) -> None:
    """Echo the package version and stop the CLI when the flag is set."""
    if not value:
        return
    typer.echo(f"loopengt {__version__}")
    raise typer.Exit()
23
+
24
+
25
# Root callback of the Typer app. Its only job is to declare the global
# ``--version`` / ``-v`` flag; the flag's work is done by ``version_callback``.
@app.callback()
def main(
    version: bool = typer.Option(
        False,
        "--version",
        "-v",
        help="Show version and exit.",
        callback=version_callback,
        # NOTE(review): eager options are presumably evaluated before the
        # remaining parameters, so the flag works without a sub-command —
        # confirm against the Typer/Click documentation.
        is_eager=True,
    ),
) -> None:
    """Loop Engineering Agent CLI."""
    # Intentionally empty: there is nothing to do beyond the option above.
37
+
38
+
39
+ # ── Register sub-commands ──────────────────────────────────────────────
40
+
41
def _register_commands() -> None:
    """Attach every sub-command and the ``template`` sub-app to ``app``.

    The command modules are imported inside the function rather than at
    module import time.
    """
    from loopengt.cli.commands.design import design_cmd
    from loopengt.cli.commands.doctor import doctor_cmd
    from loopengt.cli.commands.eval import eval_cmd
    from loopengt.cli.commands.init import init_cmd
    from loopengt.cli.commands.mcp_serve import mcp_cmd
    from loopengt.cli.commands.run import run_cmd
    from loopengt.cli.commands.template import template_app
    from loopengt.cli.commands.trace import trace_cmd

    # (command name, help text, handler) in registration order.
    command_table = (
        ("init", "Scaffold a new .loopengt/ project directory.", init_cmd),
        ("design", "Design a loop from a natural-language goal.", design_cmd),
        ("run", "Execute a loop from a YAML spec.", run_cmd),
        ("trace", "Inspect the execution trace of a run.", trace_cmd),
        ("eval", "Run evaluations against a completed run.", eval_cmd),
        ("doctor", "Diagnose configuration and dependencies.", doctor_cmd),
        ("mcp", "Start the MCP server.", mcp_cmd),
    )
    for command_name, help_text, handler in command_table:
        app.command(command_name, help=help_text)(handler)

    app.add_typer(template_app, name="template", help="Manage loop templates.")
60
+
61
+
62
+ _register_commands()
63
+
64
+
65
+ if __name__ == "__main__":
66
+ app()
@@ -0,0 +1 @@
1
+ """Core engine for loop design, execution, and evaluation."""
@@ -0,0 +1 @@
1
+ """Evaluation subsystem."""
@@ -0,0 +1,216 @@
1
+ """LLM-as-judge evaluators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import structlog
8
+
9
+ logger = structlog.get_logger(__name__)
10
+
11
+
12
class JudgeResult:
    """Outcome of one LLM judge evaluation.

    Holds the judge's name, a numeric score, a pass/fail flag, the judge's
    reasoning and the raw model output.
    """

    __slots__ = ("judge_name", "score", "passed", "reasoning", "raw_output")

    def __init__(
        self,
        judge_name: str,
        score: float,
        passed: bool,
        reasoning: str = "",
        raw_output: str = "",
    ) -> None:
        self.judge_name, self.score, self.passed = judge_name, score, passed
        self.reasoning, self.raw_output = reasoning, raw_output

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict; ``raw_output`` is not included."""
        exported = ("judge_name", "score", "passed", "reasoning")
        return {field: getattr(self, field) for field in exported}
38
+
39
+
40
class LLMJudge:
    """Base class for LLM-as-judge evaluators.

    Subclass and implement ``_build_prompt`` and ``_parse_response``
    to create custom judges. Subclasses that ask the model for a
    ``{"score": ..., "reasoning": ...}`` JSON reply can delegate to
    ``_parse_scored_response``.

    Usage::

        judge = GoalAchievementJudge()
        result = await judge.evaluate(trace_data, goal="build an API")
    """

    # Minimum score for a JSON-scored reply to count as passed.
    PASS_THRESHOLD: float = 0.8

    def __init__(self, name: str = "llm_judge") -> None:
        self._name = name
        self._log = logger.bind(judge=name)

    async def evaluate(
        self, trace: dict[str, Any], **kwargs: Any
    ) -> JudgeResult:
        """Evaluate a loop run trace.

        Returns a ``JudgeResult`` with score and reasoning.
        In stub mode (no LLM configured), returns a default result.
        Any exception raised while calling or parsing is logged and turned
        into a failed result with score 0.0 instead of propagating.
        """
        prompt = self._build_prompt(trace, **kwargs)

        try:
            response = await self._call_llm(prompt)
            return self._parse_response(response)
        except Exception as exc:  # noqa: BLE001 - a judge must never abort the run
            self._log.warning("judge.error", error=str(exc))
            return JudgeResult(
                judge_name=self._name,
                score=0.0,
                passed=False,
                reasoning=f"Judge failed: {exc}",
            )

    def _build_prompt(self, trace: dict[str, Any], **kwargs: Any) -> str:
        """Build the evaluation prompt. Override in subclasses."""
        return (
            "Evaluate this loop execution trace:\n"
            f"{trace}\n\n"
            "Score from 0.0 to 1.0 and explain your reasoning."
        )

    def _parse_response(self, response: str) -> JudgeResult:
        """Parse the LLM response. Override in subclasses.

        The base implementation does not interpret the text: it reports a
        fixed score of 0.5, marks the result as passed and keeps the
        response as the reasoning.
        """
        return JudgeResult(
            judge_name=self._name,
            score=0.5,
            passed=True,
            reasoning=response,
            raw_output=response,
        )

    @staticmethod
    def _extract_verdict(response: str) -> tuple[float, str] | None:
        """Pull ``(score, reasoning)`` out of a JSON reply.

        Accepts bare JSON as well as JSON wrapped in a Markdown code fence
        (with or without a ``json`` tag, with or without a closing fence).
        Returns ``None`` when the text is not a JSON object or its score
        cannot be converted to ``float``.
        """
        import json

        text = response.strip()
        if text.startswith("```"):
            text = text[3:]
            if text[:4].lower() == "json":
                text = text[4:]
            text = text.strip()
            # Only trim a closing fence that is really there; slicing
            # blindly would cut into the JSON itself.
            if text.endswith("```"):
                text = text[:-3]
            text = text.strip()

        try:
            data = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return None
        if not isinstance(data, dict):
            return None

        try:
            score = float(data.get("score", 0.0))
        except (TypeError, ValueError):
            return None

        reasoning = data.get("reasoning", "")
        if not isinstance(reasoning, str):
            reasoning = str(reasoning)
        return score, reasoning

    def _parse_scored_response(self, response: str) -> JudgeResult:
        """Turn a JSON-scored reply into a ``JudgeResult``.

        An unparseable reply yields score 0.0, ``passed=False`` and the raw
        response as reasoning.
        """
        verdict = self._extract_verdict(response)
        if verdict is None:
            return JudgeResult(
                judge_name=self._name,
                score=0.0,
                passed=False,
                reasoning=response,
                raw_output=response,
            )

        score, reasoning = verdict
        return JudgeResult(
            judge_name=self._name,
            score=score,
            passed=score >= self.PASS_THRESHOLD,
            reasoning=reasoning,
            raw_output=response,
        )

    @staticmethod
    def _failure_payload(error: object) -> str:
        """Return the JSON reply used when the LLM call itself fails."""
        import json

        # json.dumps escapes quotes/newlines in the error text, which an
        # f-string template would leave as invalid JSON.
        return json.dumps(
            {"score": 0.0, "reasoning": f"LLM call failed: {error}"},
            ensure_ascii=False,
        )

    async def _call_llm(self, prompt: str) -> str:
        """Call the LLM to evaluate the trace.

        Reads ``LOOPENGT_LLM_PROVIDER`` (``openai`` by default, or
        ``huggingface``), the matching API key (``OPENAI_API_KEY`` /
        ``HF_TOKEN``) and ``LOOPENGT_EVAL_MODEL`` from the environment.
        Returns a canned JSON stub when the ``openai`` package or the API
        key is missing, and a score-0.0 JSON reply when the request fails.
        """
        import os

        try:
            from openai import AsyncOpenAI
        except ImportError:
            self._log.warning("openai_not_installed", fallback="stub")
            return '{"score": 0.5, "reasoning": "[stub] Install loopengt[llm] for real evaluation"}'

        provider = os.environ.get("LOOPENGT_LLM_PROVIDER", "openai").lower()
        if provider == "huggingface":
            api_key = os.environ.get("HF_TOKEN")
            base_url = "https://api-inference.huggingface.co/v1/"
        else:
            api_key = os.environ.get("OPENAI_API_KEY")
            base_url = None

        if not api_key:
            self._log.warning("no_api_key", fallback="stub")
            return '{"score": 0.5, "reasoning": "[stub] Missing API key"}'

        client = AsyncOpenAI(api_key=api_key, base_url=base_url)
        model = os.environ.get("LOOPENGT_EVAL_MODEL", "gpt-4o")

        try:
            response = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an objective AI evaluator. Return JSON format."},
                    {"role": "user", "content": prompt},
                ],
            )
            return response.choices[0].message.content or ""
        except Exception as e:  # noqa: BLE001 - reported through the reply
            self._log.error("llm_eval_failed", error=str(e))
            return self._failure_payload(e)


class GoalAchievementJudge(LLMJudge):
    """Judge that evaluates whether the loop achieved its stated goal."""

    def __init__(self) -> None:
        super().__init__(name="goal_achievement")

    def _build_prompt(self, trace: dict[str, Any], **kwargs: Any) -> str:
        """Build a prompt from the goal, run status and execution history.

        The goal comes from the ``goal`` keyword argument, falling back to
        ``trace["context"]["goal"]`` and finally to ``"unknown"``.
        """
        # ``or {}`` tolerates a trace whose context is explicitly null.
        context = trace.get("context") or {}
        goal = kwargs.get("goal", context.get("goal", "unknown"))
        status = trace.get("status", "unknown")
        history = trace.get("history", [])

        return (
            f"## Goal\n{goal}\n\n"
            f"## Run Status\n{status}\n\n"
            f"## Execution History\n{history}\n\n"
            "## Instructions\n"
            "Score this run from 0.0 to 1.0 on goal achievement.\n"
            'Respond with JSON: {"score": float, "reasoning": "..."}'
        )

    def _parse_response(self, response: str) -> JudgeResult:
        """Parse the JSON-scored reply; see ``_parse_scored_response``."""
        return self._parse_scored_response(response)


class QualityJudge(LLMJudge):
    """Judge that evaluates the overall quality of loop outputs."""

    def __init__(self) -> None:
        super().__init__(name="quality")

    def _build_prompt(self, trace: dict[str, Any], **kwargs: Any) -> str:
        """Build a prompt asking for a quality score of the run history."""
        return (
            "Evaluate the quality of this loop's outputs:\n"
            f"{trace.get('history', [])}\n\n"
            "Consider: completeness, correctness, clarity, efficiency.\n"
            "Score from 0.0 to 1.0 with reasoning.\n"
            'Respond with JSON: {"score": float, "reasoning": "..."}'
        )

    def _parse_response(self, response: str) -> JudgeResult:
        """Parse the JSON-scored reply; see ``_parse_scored_response``."""
        return self._parse_scored_response(response)
@@ -0,0 +1,119 @@
1
+ """Built-in evaluation metrics for loop runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import structlog
8
+
9
+ logger = structlog.get_logger(__name__)
10
+
11
+
12
class MetricResult:
    """Value object for the outcome of one metric: name, value, pass flag, detail."""

    __slots__ = ("name", "value", "passed", "detail")

    def __init__(
        self, name: str, value: Any, passed: bool, detail: str = ""
    ) -> None:
        self.name, self.value = name, value
        self.passed, self.detail = passed, detail

    def to_dict(self) -> dict[str, Any]:
        """Return all four fields as a plain dict."""
        return {field: getattr(self, field) for field in self.__slots__}
32
+
33
+
34
def total_turns(trace: dict[str, Any]) -> MetricResult:
    """Report the trace's ``turn`` counter (0 when absent); always passes."""
    return MetricResult("total_turns", trace.get("turn", 0), True)
38
+
39
+
40
def total_steps(trace: dict[str, Any]) -> MetricResult:
    """Report how many entries ``step_states`` holds; always passes."""
    step_states = trace.get("step_states", {})
    return MetricResult("total_steps", len(step_states), True)
44
+
45
+
46
def completed_steps(trace: dict[str, Any]) -> MetricResult:
    """Report how many steps have status ``"completed"``; always passes."""
    statuses = [
        state.get("status")
        for state in trace.get("step_states", {}).values()
    ]
    return MetricResult("completed_steps", statuses.count("completed"), True)
54
+
55
+
56
def failed_steps(trace: dict[str, Any]) -> MetricResult:
    """Report how many steps have status ``"failed"``; passes only when none do."""
    statuses = [
        state.get("status")
        for state in trace.get("step_states", {}).values()
    ]
    failures = statuses.count("failed")
    return MetricResult("failed_steps", failures, failures == 0)
64
+
65
+
66
def success_rate(trace: dict[str, Any]) -> MetricResult:
    """Report the share of steps with status ``"completed"``.

    The value is rounded to four decimals and passes at 0.5 or above
    (checked on the unrounded rate). A trace without steps yields 0.0 and
    fails with detail ``"No steps"``.
    """
    step_states = trace.get("step_states", {})
    if len(step_states) == 0:
        return MetricResult("success_rate", 0.0, False, "No steps")

    done = 0
    for state in trace.get("step_states", {}).values():
        if state.get("status") == "completed":
            done += 1

    ratio = done / len(step_states)
    return MetricResult("success_rate", round(ratio, 4), ratio >= 0.5)
79
+
80
+
81
def total_duration(trace: dict[str, Any]) -> MetricResult:
    """Sum of all step durations.

    Adds up ``result["duration_seconds"]`` over every entry in
    ``step_states`` and reports it rounded to two decimals under the name
    ``total_duration_seconds``; always passes. Steps without a result or
    without a duration contribute 0.
    """
    duration = sum(
        # ``or 0`` covers a duration stored as an explicit null, which would
        # otherwise make ``sum`` raise TypeError.
        (s.get("result") or {}).get("duration_seconds") or 0
        for s in (trace.get("step_states") or {}).values()
    )
    return MetricResult("total_duration_seconds", round(duration, 2), True)
88
+
89
+
90
def run_status(trace: dict[str, Any]) -> MetricResult:
    """Report the run's ``status`` field; passes only for ``"completed"``."""
    state = trace.get("status", "unknown")
    succeeded = state == "completed"
    return MetricResult("run_status", state, succeeded)
94
+
95
+
96
# Registry of all built-in metrics.
# Maps the public metric name to the function that computes it; every
# function takes the trace dict and returns a MetricResult. ``evaluate_all``
# looks metrics up here by name, and with no explicit selection runs them in
# this insertion order.
BUILTIN_METRICS = {
    "total_turns": total_turns,
    "total_steps": total_steps,
    "completed_steps": completed_steps,
    "failed_steps": failed_steps,
    "success_rate": success_rate,
    "total_duration": total_duration,
    "run_status": run_status,
}
106
+
107
+
108
def evaluate_all(
    trace: dict[str, Any],
    metric_names: list[str] | None = None,
) -> list[MetricResult]:
    """Run all (or selected) metrics against a trace.

    Args:
        trace: Run trace passed unchanged to every metric function.
        metric_names: Names to look up in ``BUILTIN_METRICS``. ``None`` or
            an empty list selects every built-in metric.

    Returns:
        One ``MetricResult`` per recognised name, in the requested order.
        Unknown names produce no result; they are reported with a warning
        so a typo in the selection does not go unnoticed.
    """
    selected = metric_names or list(BUILTIN_METRICS)
    results = []
    for name in selected:
        fn = BUILTIN_METRICS.get(name)
        if fn is None:
            logger.warning(
                "metric.unknown",
                metric=name,
                available=sorted(BUILTIN_METRICS),
            )
            continue
        results.append(fn(trace))
    return results