loopengt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopengt/__init__.py +31 -0
- loopengt/adapters/__init__.py +1 -0
- loopengt/adapters/antigravity/__init__.py +1 -0
- loopengt/adapters/antigravity/adapter.py +55 -0
- loopengt/adapters/antigravity/commands.py +21 -0
- loopengt/adapters/base.py +51 -0
- loopengt/adapters/claude_code/__init__.py +1 -0
- loopengt/adapters/claude_code/adapter.py +55 -0
- loopengt/adapters/claude_code/commands.py +16 -0
- loopengt/adapters/codex/__init__.py +1 -0
- loopengt/adapters/codex/adapter.py +52 -0
- loopengt/adapters/codex/commands.py +16 -0
- loopengt/adapters/cursor/__init__.py +1 -0
- loopengt/adapters/cursor/adapter.py +56 -0
- loopengt/adapters/cursor/commands.py +29 -0
- loopengt/adapters/generic/__init__.py +1 -0
- loopengt/adapters/generic/terminal.py +82 -0
- loopengt/cli/__init__.py +1 -0
- loopengt/cli/commands/__init__.py +1 -0
- loopengt/cli/commands/design.py +171 -0
- loopengt/cli/commands/doctor.py +110 -0
- loopengt/cli/commands/eval.py +105 -0
- loopengt/cli/commands/init.py +131 -0
- loopengt/cli/commands/mcp_serve.py +57 -0
- loopengt/cli/commands/run.py +99 -0
- loopengt/cli/commands/template.py +145 -0
- loopengt/cli/commands/trace.py +114 -0
- loopengt/cli/formatters.py +125 -0
- loopengt/cli/main.py +66 -0
- loopengt/core/__init__.py +1 -0
- loopengt/core/evals/__init__.py +1 -0
- loopengt/core/evals/judges.py +216 -0
- loopengt/core/evals/metrics.py +119 -0
- loopengt/core/evals/regression.py +157 -0
- loopengt/core/memory/__init__.py +1 -0
- loopengt/core/memory/retrieval.py +124 -0
- loopengt/core/memory/store.py +184 -0
- loopengt/core/memory/summarizer.py +97 -0
- loopengt/core/models/__init__.py +43 -0
- loopengt/core/models/agent.py +126 -0
- loopengt/core/models/loop_spec.py +251 -0
- loopengt/core/models/policy.py +131 -0
- loopengt/core/models/state.py +271 -0
- loopengt/core/models/tool.py +105 -0
- loopengt/core/runtime/__init__.py +1 -0
- loopengt/core/runtime/checkpoint.py +152 -0
- loopengt/core/runtime/executor.py +463 -0
- loopengt/core/runtime/handoff.py +139 -0
- loopengt/core/runtime/scheduler.py +168 -0
- loopengt/core/tracing/__init__.py +1 -0
- loopengt/core/tracing/events.py +95 -0
- loopengt/core/tracing/exporters.py +158 -0
- loopengt/core/tracing/store.py +202 -0
- loopengt/mcp/__init__.py +1 -0
- loopengt/mcp/client/__init__.py +1 -0
- loopengt/mcp/client/manager.py +118 -0
- loopengt/mcp/client/tools.py +107 -0
- loopengt/mcp/server/__init__.py +1 -0
- loopengt/mcp/server/prompts.py +82 -0
- loopengt/mcp/server/resources.py +75 -0
- loopengt/mcp/server/server.py +50 -0
- loopengt/mcp/server/tools.py +214 -0
- loopengt/mcp/shared/__init__.py +1 -0
- loopengt/mcp/shared/schemas.py +91 -0
- loopengt/plugins/__init__.py +1 -0
- loopengt/plugins/base.py +90 -0
- loopengt/plugins/loader.py +130 -0
- loopengt/plugins/manifest.py +70 -0
- loopengt/plugins/registry.py +146 -0
- loopengt/prompts/LOOPENGT.md +60 -0
- loopengt/prompts/__init__.py +1 -0
- loopengt/storage/__init__.py +1 -0
- loopengt/storage/jsonl.py +84 -0
- loopengt/storage/sqlite.py +102 -0
- loopengt/templates/__init__.py +1 -0
- loopengt/templates/builtins/handoff_loop/LOOPENGS.md +10 -0
- loopengt/templates/builtins/planner_executor/LOOPENGS.md +29 -0
- loopengt/templates/builtins/research_architect/LOOPENGS.md +17 -0
- loopengt/templates/builtins/reviewer_retry/LOOPENGS.md +29 -0
- loopengt/templates/builtins/supervisor_workers/LOOPENGS.md +29 -0
- loopengt/templates/loader.py +38 -0
- loopengt/templates/registry.py +85 -0
- loopengt-0.1.0.dist-info/METADATA +275 -0
- loopengt-0.1.0.dist-info/RECORD +87 -0
- loopengt-0.1.0.dist-info/WHEEL +4 -0
- loopengt-0.1.0.dist-info/entry_points.txt +8 -0
- loopengt-0.1.0.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""``loopengt trace`` — inspect the execution trace of a run."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from loopengt.cli.formatters import (
|
|
11
|
+
console,
|
|
12
|
+
print_error,
|
|
13
|
+
print_info,
|
|
14
|
+
print_run_trace_tree,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def trace_cmd(
    run_id: str = typer.Argument(..., help="Run ID to inspect."),
    runs_dir: Path = typer.Option(
        Path(".loopengt/runs"),
        "--runs-dir",
        help="Directory containing run traces.",
    ),
    output_format: str = typer.Option(
        "tree",
        "--format",
        "-f",
        help="Output format: tree, json, markdown.",
    ),
) -> None:
    """Show step-by-step execution trace for a completed run."""
    # Resolution order: exact ``<runs_dir>/<run_id>.json`` first, then the
    # alphabetically first ``*.json`` file whose name starts with ``run_id``.
    # Exits with code 1 when no trace is found or the file cannot be loaded
    # as a JSON object.
    trace_path = runs_dir / f"{run_id}.json"

    if not trace_path.is_file():
        # Prefix match restricted to regular ``.json`` files and sorted, so a
        # directory or unrelated file sharing the prefix is never selected and
        # the choice is deterministic. Filtering by name (instead of building
        # a glob pattern from ``run_id``) keeps glob metacharacters literal.
        candidates = sorted(
            p
            for p in runs_dir.glob("*.json")
            if p.is_file() and p.name.startswith(run_id)
        )
        if candidates:
            trace_path = candidates[0]
        else:
            print_error(f"Trace not found: {trace_path}")
            print_info(f"Available runs in {runs_dir}:")
            if runs_dir.exists():
                for f in sorted(runs_dir.glob("*.json")):
                    print_info(f" {f.stem}")
            raise typer.Exit(code=1)

    try:
        trace_data = json.loads(trace_path.read_text(encoding="utf-8"))
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print_error(f"Failed to read trace: {exc}")
        raise typer.Exit(code=1) from exc

    if not isinstance(trace_data, dict):
        # The renderers below call ``.get`` on the document, so anything other
        # than a JSON object would fail with an AttributeError.
        print_error(
            f"Failed to read trace: expected a JSON object in {trace_path}"
        )
        raise typer.Exit(code=1)

    # Any value other than "json" or "markdown" falls back to the tree view.
    if output_format == "json":
        console.print_json(json.dumps(trace_data, indent=2, default=str))
    elif output_format == "markdown":
        _print_markdown_trace(trace_data)
    else:
        _print_tree_trace(trace_data)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _print_tree_trace(trace_data: dict) -> None:
    """Render trace as a Rich tree.

    Flattens ``trace_data["step_states"]`` (step name -> state mapping) into
    the list-of-steps shape that ``print_run_trace_tree`` consumes, then
    delegates the actual rendering to it.

    Args:
        trace_data: Parsed trace document. Missing or null ``run_id``,
            ``status``, ``step_states`` and per-step ``result`` are tolerated.
    """
    steps = []
    # ``or {}`` guards against an explicit null in the JSON document, which
    # ``.get(key, default)`` alone does not cover.
    for step_name, step_state in (trace_data.get("step_states") or {}).items():
        result = step_state.get("result") or {}
        steps.append(
            {
                "step_name": step_name,
                "status": step_state.get("status", "unknown"),
                # A null duration would break the ``:.2f`` formatting in the
                # tree formatter, so normalise it to 0.
                "duration_seconds": result.get("duration_seconds") or 0,
                "error": result.get("error"),
            }
        )

    print_run_trace_tree(
        {
            "run_id": trace_data.get("run_id", "unknown"),
            "status": trace_data.get("status", "unknown"),
            "steps": steps,
        }
    )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _print_markdown_trace(trace_data: dict) -> None:
    """Render trace as markdown text.

    Emits a header with run id, status, loop name and turn count, followed by
    one ``###`` section per entry in ``trace_data["step_states"]`` listing its
    status, attempt count and, when present, error and duration.

    Args:
        trace_data: Parsed trace document; missing keys fall back to
            ``"unknown"`` / ``0``.
    """
    lines = [
        f"# Run Trace: {trace_data.get('run_id', 'unknown')}",
        "",
        f"**Status**: {trace_data.get('status', 'unknown')}",
        f"**Loop**: {trace_data.get('loop_name', 'unknown')}",
        f"**Turns**: {trace_data.get('turn', 0)}",
        "",
        "## Steps",
        "",
    ]

    # ``or {}`` tolerates an explicit null ``step_states`` in the document.
    for step_name, step_state in (trace_data.get("step_states") or {}).items():
        status = step_state.get("status", "unknown")
        attempts = step_state.get("attempts", 0)
        lines.append(f"### {step_name}")
        lines.append(f"- Status: {status}")
        lines.append(f"- Attempts: {attempts}")
        result = step_state.get("result")
        if result:
            if result.get("error"):
                lines.append(f"- Error: {result['error']}")
            if result.get("duration_seconds"):
                lines.append(f"- Duration: {result['duration_seconds']:.2f}s")
        lines.append("")

    # The text is plain markdown, not Rich markup: disable markup parsing so
    # bracketed text inside step names or error messages (e.g. "[Errno 2]")
    # is printed verbatim instead of being treated as a style tag.
    console.print("\n".join(lines), markup=False)
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Rich formatters for CLI output."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.panel import Panel
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
from rich.text import Text
|
|
11
|
+
from rich.tree import Tree
|
|
12
|
+
|
|
13
|
+
console = Console()
|
|
14
|
+
error_console = Console(stderr=True)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def print_success(message: str) -> None:
    """Write *message* to the standard console behind a bold green check mark.

    The message is interpolated into a Rich markup string, so any markup tags
    it contains are rendered rather than shown literally.
    """
    badge = "[bold green]✓[/bold green]"
    console.print(f"{badge} {message}")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def print_error(message: str) -> None:
    """Write *message* to the stderr console behind a bold red cross.

    The message is interpolated into a Rich markup string, so any markup tags
    it contains are rendered rather than shown literally.
    """
    badge = "[bold red]✗[/bold red]"
    error_console.print(f"{badge} {message}")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def print_warning(message: str) -> None:
    """Write *message* to the standard console behind a bold yellow warning sign.

    The message is interpolated into a Rich markup string, so any markup tags
    it contains are rendered rather than shown literally.
    """
    badge = "[bold yellow]⚠[/bold yellow]"
    console.print(f"{badge} {message}")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def print_info(message: str) -> None:
    """Write *message* to the standard console behind a bold blue arrow.

    The message is interpolated into a Rich markup string, so any markup tags
    it contains are rendered rather than shown literally.
    """
    badge = "[bold blue]→[/bold blue]"
    console.print(f"{badge} {message}")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def print_header(title: str, subtitle: str = "") -> None:
    """Print a blue-bordered panel with a bold title and optional dim subtitle.

    Args:
        title: Text shown in bold on the first line of the panel.
        subtitle: Text shown dimmed on a second line; omitted when empty.
    """
    rows = [f"[bold]{title}[/bold]"]
    if subtitle:
        rows.append(f"[dim]{subtitle}[/dim]")
    panel = Panel("\n".join(rows), border_style="blue", padding=(1, 2))
    console.print(panel)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def print_loop_spec_table(spec: dict[str, Any]) -> None:
    """Print a two-column summary table of a loop specification.

    Rows are emitted, in order, for whichever of ``name``, ``version``,
    ``goal``, ``pattern``, ``agents`` and ``steps`` are present in *spec*.
    ``agents`` and ``steps`` are shown as a comma-separated list of names.

    Args:
        spec: Loop specification as a plain mapping.
    """

    def _names(entries: Any) -> str:
        # Dict entries contribute their "name" value; anything else is
        # stringified as-is.
        return ", ".join(
            entry["name"] if isinstance(entry, dict) else str(entry)
            for entry in entries
        )

    summary = Table(title="Loop Specification", show_header=True, border_style="blue")
    summary.add_column("Field", style="cyan", width=20)
    summary.add_column("Value", style="white")

    for field in ("name", "version", "goal", "pattern"):
        if field in spec:
            summary.add_row(field, str(spec[field]))

    for field in ("agents", "steps"):
        if field in spec:
            summary.add_row(field, _names(spec[field]))

    console.print(summary)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def print_run_trace_tree(trace: dict[str, Any]) -> None:
    """Print an execution trace as a Rich tree.

    The root node shows the run id and its status; each entry of
    ``trace["steps"]`` becomes a child node with a coloured bullet, the step
    name and its duration, plus a red ``Error: ...`` leaf when the step
    carries an error.

    Args:
        trace: Mapping with optional ``run_id``, ``status`` and ``steps``
            keys. Each step is a mapping with optional ``step_name``,
            ``status``, ``duration_seconds`` and ``error``.
    """
    run_id = trace.get("run_id", "unknown")
    status = trace.get("status", "unknown")

    status_color = {
        "completed": "green",
        "failed": "red",
        "running": "yellow",
        "pending": "dim",
        "cancelled": "red",
    }.get(status, "white")

    # Labels are assembled from (text, style) pairs rather than a markup
    # string, so brackets inside run ids, statuses or step names coming from
    # the trace file are shown literally instead of being parsed as tags.
    tree = Tree(
        Text.assemble(
            ("Run:", "bold"),
            f" {run_id} ",
            (str(status), status_color),
        )
    )

    for step in trace.get("steps") or []:
        step_name = step.get("step_name", "unknown")
        step_status = step.get("status", "unknown")

        # A null or non-numeric duration must not abort the whole rendering.
        try:
            duration = float(step.get("duration_seconds") or 0)
        except (TypeError, ValueError):
            duration = 0.0

        s_color = {
            "completed": "green",
            "failed": "red",
            "running": "yellow",
            "skipped": "dim",
        }.get(step_status, "white")

        step_node = tree.add(
            Text.assemble(
                ("●", s_color),
                f" {step_name} ",
                (f"{duration:.2f}s", "dim"),
            )
        )

        if step.get("error"):
            step_node.add(Text(f"Error: {step['error']}", style="red"))

    console.print(tree)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def print_doctor_result(name: str, passed: bool, detail: str = "") -> None:
    """Print one diagnostic check as ``<icon> <name> [detail]``.

    Args:
        name: Label of the check.
        passed: Selects a green check mark when true, a red cross otherwise.
        detail: Extra text appended dimmed after the name; omitted when empty.
    """
    if passed:
        icon = "[bold green]✓[/bold green]"
    else:
        icon = "[bold red]✗[/bold red]"
    suffix = f" [dim]{detail}[/dim]" if detail else ""
    console.print(f"{icon} {name}{suffix}")
|
loopengt/cli/main.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""loopengt CLI — the main Typer application entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from loopengt import __version__
|
|
8
|
+
|
|
9
|
+
# Root Typer application. Invoking it without arguments prints the help text;
# sub-commands are attached further down by ``_register_commands``.
app = typer.Typer(
    name="loopengt",
    help="Loop Engineering Agent — design, orchestrate, and evaluate agent loops.",
    rich_markup_mode="rich",
    add_completion=True,
    no_args_is_help=True,
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def version_callback(value: bool) -> None:
    """Echo ``loopengt <version>`` and exit when the flag is set.

    Args:
        value: Parsed value of the version flag; nothing happens when falsy.

    Raises:
        typer.Exit: After printing the version, to stop further processing.
    """
    if not value:
        return
    typer.echo(f"loopengt {__version__}")
    raise typer.Exit()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Application-level callback: it only exists to host the global --version/-v
# flag. The flag is eager, so ``version_callback`` runs before any sub-command
# is resolved.
@app.callback()
def main(
    version: bool = typer.Option(
        False,
        "--version",
        "-v",
        is_eager=True,
        callback=version_callback,
        help="Show version and exit.",
    ),
) -> None:
    """Loop Engineering Agent CLI."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ── Register sub-commands ──────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
def _register_commands() -> None:
    """Import every command module and attach its handler to ``app``.

    The imports live inside the function, so the command modules are loaded
    only when this function runs rather than at the top of the module.
    """
    from loopengt.cli.commands.design import design_cmd
    from loopengt.cli.commands.doctor import doctor_cmd
    from loopengt.cli.commands.eval import eval_cmd
    from loopengt.cli.commands.init import init_cmd
    from loopengt.cli.commands.mcp_serve import mcp_cmd
    from loopengt.cli.commands.run import run_cmd
    from loopengt.cli.commands.template import template_app
    from loopengt.cli.commands.trace import trace_cmd

    # (command name, help text, handler), in registration order.
    simple_commands = (
        ("init", "Scaffold a new .loopengt/ project directory.", init_cmd),
        ("design", "Design a loop from a natural-language goal.", design_cmd),
        ("run", "Execute a loop from a YAML spec.", run_cmd),
        ("trace", "Inspect the execution trace of a run.", trace_cmd),
        ("eval", "Run evaluations against a completed run.", eval_cmd),
        ("doctor", "Diagnose configuration and dependencies.", doctor_cmd),
        ("mcp", "Start the MCP server.", mcp_cmd),
    )
    for command_name, help_text, handler in simple_commands:
        app.command(command_name, help=help_text)(handler)

    # ``template`` is a nested Typer app with its own sub-commands.
    app.add_typer(template_app, name="template", help="Manage loop templates.")


# Populate the CLI as soon as this module is imported.
_register_commands()


if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core engine for loop design, execution, and evaluation."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Evaluation subsystem."""
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""LLM-as-judge evaluators."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import structlog
|
|
8
|
+
|
|
9
|
+
logger = structlog.get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JudgeResult:
    """Outcome of a single LLM judge evaluation.

    Attributes are plain values: the judge's name, a numeric score, a
    pass/fail flag, the judge's reasoning and the raw model output.
    """

    __slots__ = ("judge_name", "score", "passed", "reasoning", "raw_output")

    def __init__(
        self,
        judge_name: str,
        score: float,
        passed: bool,
        reasoning: str = "",
        raw_output: str = "",
    ) -> None:
        self.judge_name, self.score, self.passed = judge_name, score, passed
        self.reasoning, self.raw_output = reasoning, raw_output

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dict; ``raw_output`` is not included."""
        exported = ("judge_name", "score", "passed", "reasoning")
        return {field: getattr(self, field) for field in exported}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LLMJudge:
    """Base class for LLM-as-judge evaluators.

    Subclass and implement ``_build_prompt`` and ``_parse_response``
    to create custom judges.

    Usage::

        judge = GoalAchievementJudge()
        result = await judge.evaluate(trace_data, goal="build an API")
    """

    def __init__(self, name: str = "llm_judge") -> None:
        self._name = name
        self._log = logger.bind(judge=name)

    async def evaluate(
        self, trace: dict[str, Any], **kwargs: Any
    ) -> JudgeResult:
        """Evaluate a loop run trace.

        Builds the prompt, sends it to the LLM and parses the reply.

        Args:
            trace: Trace document of the run being judged.
            **kwargs: Extra values forwarded to ``_build_prompt``.

        Returns:
            A ``JudgeResult`` with score and reasoning. In stub mode (the
            ``openai`` package or the API key is missing) the stub reply is
            parsed like any other. If calling or parsing raises, a failed
            result with score ``0.0`` is returned instead of propagating.
        """
        prompt = self._build_prompt(trace, **kwargs)

        try:
            response = await self._call_llm(prompt)
            return self._parse_response(response)
        except Exception as exc:  # noqa: BLE001
            # Deliberate best-effort boundary: a broken judge must not abort
            # the evaluation run, it just scores zero.
            self._log.warning("judge.error", error=str(exc))
            return JudgeResult(
                judge_name=self._name,
                score=0.0,
                passed=False,
                reasoning=f"Judge failed: {exc}",
            )

    def _build_prompt(self, trace: dict[str, Any], **kwargs: Any) -> str:
        """Build the evaluation prompt. Override in subclasses."""
        return (
            f"Evaluate this loop execution trace:\n"
            f"{trace}\n\n"
            f"Score from 0.0 to 1.0 and explain your reasoning."
        )

    def _parse_response(self, response: str) -> JudgeResult:
        """Parse the LLM response. Override in subclasses.

        The base implementation does not inspect the reply: it always reports
        score ``0.5`` and ``passed=True`` with the reply as reasoning.
        """
        return JudgeResult(
            judge_name=self._name,
            score=0.5,
            passed=True,
            reasoning=response,
            raw_output=response,
        )

    @staticmethod
    def _failure_payload(exc: Exception) -> str:
        """Return the JSON reply used when the LLM call itself fails.

        Serialised with ``json.dumps`` so quotes, backslashes or newlines in
        the exception message cannot produce invalid JSON.
        """
        import json

        return json.dumps(
            {"score": 0.0, "reasoning": f"LLM call failed: {exc}"},
            ensure_ascii=False,
        )

    async def _call_llm(self, prompt: str) -> str:
        """Call the LLM to evaluate the trace.

        Configuration is read from the environment: ``LOOPENGT_LLM_PROVIDER``
        (``openai`` by default, or ``huggingface``), the matching
        ``OPENAI_API_KEY`` / ``HF_TOKEN``, and ``LOOPENGT_EVAL_MODEL``
        (``gpt-4o`` by default).

        Returns:
            The model's reply text, or a JSON stub/failure string when the
            ``openai`` package is missing, no API key is set, or the request
            raises.
        """
        import os

        try:
            from openai import AsyncOpenAI
        except ImportError:
            self._log.warning("openai_not_installed", fallback="stub")
            return '{"score": 0.5, "reasoning": "[stub] Install loopengt[llm] for real evaluation"}'

        provider = os.environ.get("LOOPENGT_LLM_PROVIDER", "openai").lower()
        if provider == "huggingface":
            api_key = os.environ.get("HF_TOKEN")
            base_url = "https://api-inference.huggingface.co/v1/"
        else:
            api_key = os.environ.get("OPENAI_API_KEY")
            base_url = None

        if not api_key:
            self._log.warning("no_api_key", fallback="stub")
            return '{"score": 0.5, "reasoning": "[stub] Missing API key"}'

        client = AsyncOpenAI(api_key=api_key, base_url=base_url)
        model = os.environ.get("LOOPENGT_EVAL_MODEL", "gpt-4o")

        try:
            response = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an objective AI evaluator. Return JSON format."},
                    {"role": "user", "content": prompt},
                ],
            )
            return response.choices[0].message.content or ""
        except Exception as e:  # noqa: BLE001
            self._log.error("llm_eval_failed", error=str(e))
            return self._failure_payload(e)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class GoalAchievementJudge(LLMJudge):
    """Judge that evaluates whether the loop achieved its stated goal."""

    # Minimum score for a run to count as passed.
    _PASS_THRESHOLD = 0.8

    def __init__(self) -> None:
        super().__init__(name="goal_achievement")

    def _build_prompt(self, trace: dict[str, Any], **kwargs: Any) -> str:
        """Build a prompt from the goal, run status and execution history.

        The goal is taken from the ``goal`` keyword argument when given,
        otherwise from ``trace["context"]["goal"]``, otherwise ``"unknown"``.
        """
        goal = kwargs.get("goal")
        if goal is None:
            # Looked up lazily so a missing or null ``context`` cannot fail
            # when the caller already supplied the goal.
            goal = (trace.get("context") or {}).get("goal", "unknown")
        status = trace.get("status", "unknown")
        history = trace.get("history", [])

        return (
            f"## Goal\n{goal}\n\n"
            f"## Run Status\n{status}\n\n"
            f"## Execution History\n{history}\n\n"
            f"## Instructions\n"
            f"Score this run from 0.0 to 1.0 on goal achievement.\n"
            f"Respond with JSON: {{\"score\": float, \"reasoning\": \"...\"}}"
        )

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json), if any."""
        body = text.strip()
        if not body.startswith("```"):
            return body
        body = body[3:]
        if body[:4].lower() == "json":
            body = body[4:]
        # Only drop the closing fence when it is really there; slicing
        # unconditionally would cut off the end of an unterminated reply.
        if body.endswith("```"):
            body = body[:-3]
        return body.strip()

    def _parse_response(self, response: str) -> JudgeResult:
        """Parse a ``{"score": ..., "reasoning": ...}`` JSON reply.

        Returns:
            A ``JudgeResult`` that passes when the score reaches the pass
            threshold. When the reply is not a JSON object with a numeric
            score, the score is ``0.0``, ``passed`` is false and the raw reply
            becomes the reasoning.
        """
        import json

        score = 0.0
        reasoning = "Failed to parse JSON"
        passed = False
        try:
            data = json.loads(self._strip_code_fence(response))
            score = float(data.get("score", 0.0))
            reasoning = data.get("reasoning", "")
            passed = score >= self._PASS_THRESHOLD
        except (ValueError, TypeError, AttributeError):
            # ValueError: invalid JSON or non-numeric score string;
            # TypeError: null/structured score; AttributeError: the reply is
            # valid JSON but not an object.
            score = 0.0
            passed = False
            reasoning = response

        return JudgeResult(
            judge_name=self._name,
            score=score,
            passed=passed,
            reasoning=reasoning,
            raw_output=response,
        )
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class QualityJudge(LLMJudge):
    """Judge that evaluates the overall quality of loop outputs."""

    # Minimum score for a run to count as passed.
    _PASS_THRESHOLD = 0.8

    def __init__(self) -> None:
        super().__init__(name="quality")

    def _build_prompt(self, trace: dict[str, Any], **kwargs: Any) -> str:
        """Build a prompt asking for a quality score of ``trace["history"]``."""
        return (
            f"Evaluate the quality of this loop's outputs:\n"
            f"{trace.get('history') or []}\n\n"
            f"Consider: completeness, correctness, clarity, efficiency.\n"
            f"Score from 0.0 to 1.0 with reasoning.\n"
            f"Respond with JSON: {{\"score\": float, \"reasoning\": \"...\"}}"
        )

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json), if any."""
        body = text.strip()
        if not body.startswith("```"):
            return body
        body = body[3:]
        if body[:4].lower() == "json":
            body = body[4:]
        # Only drop the closing fence when it is really there; slicing
        # unconditionally would cut off the end of an unterminated reply.
        if body.endswith("```"):
            body = body[:-3]
        return body.strip()

    def _parse_response(self, response: str) -> JudgeResult:
        """Parse a ``{"score": ..., "reasoning": ...}`` JSON reply.

        Returns:
            A ``JudgeResult`` that passes when the score reaches the pass
            threshold. When the reply is not a JSON object with a numeric
            score, the score is ``0.0``, ``passed`` is false and the raw reply
            becomes the reasoning.
        """
        import json

        score = 0.0
        reasoning = "Failed to parse JSON"
        passed = False
        try:
            data = json.loads(self._strip_code_fence(response))
            score = float(data.get("score", 0.0))
            reasoning = data.get("reasoning", "")
            passed = score >= self._PASS_THRESHOLD
        except (ValueError, TypeError, AttributeError):
            # ValueError: invalid JSON or non-numeric score string;
            # TypeError: null/structured score; AttributeError: the reply is
            # valid JSON but not an object.
            score = 0.0
            passed = False
            reasoning = response

        return JudgeResult(
            judge_name=self._name,
            score=score,
            passed=passed,
            reasoning=reasoning,
            raw_output=response,
        )
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Built-in evaluation metrics for loop runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import structlog
|
|
8
|
+
|
|
9
|
+
logger = structlog.get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MetricResult:
    """Outcome of one metric: its name, computed value, pass flag and detail."""

    __slots__ = ("name", "value", "passed", "detail")

    def __init__(
        self, name: str, value: Any, passed: bool, detail: str = ""
    ) -> None:
        self.name, self.value = name, value
        self.passed, self.detail = passed, detail

    def to_dict(self) -> dict[str, Any]:
        """Return all four fields as a plain dict, in slot order."""
        return {field: getattr(self, field) for field in self.__slots__}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def total_turns(trace: dict[str, Any]) -> MetricResult:
    """Report the trace's ``turn`` value (0 when absent); always passes."""
    return MetricResult("total_turns", trace.get("turn", 0), True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def total_steps(trace: dict[str, Any]) -> MetricResult:
    """Report how many entries ``step_states`` holds; always passes."""
    step_states = trace.get("step_states", {})
    return MetricResult("total_steps", len(step_states), True)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def completed_steps(trace: dict[str, Any]) -> MetricResult:
    """Report the number of steps whose status is ``"completed"``; always passes."""
    statuses = [
        state.get("status")
        for state in trace.get("step_states", {}).values()
    ]
    return MetricResult("completed_steps", statuses.count("completed"), True)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def failed_steps(trace: dict[str, Any]) -> MetricResult:
    """Report the number of steps whose status is ``"failed"``.

    The metric passes only when that number is zero.
    """
    failures = [
        state
        for state in trace.get("step_states", {}).values()
        if state.get("status") == "failed"
    ]
    return MetricResult("failed_steps", len(failures), not failures)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def success_rate(trace: dict[str, Any]) -> MetricResult:
    """Report the fraction of steps whose status is ``"completed"``.

    The value is rounded to four decimals and the metric passes at a rate of
    0.5 or higher. A trace without steps yields ``0.0``, failed, with the
    detail ``"No steps"``.
    """
    states = trace.get("step_states", {})
    total = len(states)
    if not total:
        return MetricResult("success_rate", 0.0, False, "No steps")

    done = sum(state.get("status") == "completed" for state in states.values())
    rate = done / total
    return MetricResult("success_rate", round(rate, 4), rate >= 0.5)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def total_duration(trace: dict[str, Any]) -> MetricResult:
    """Sum of all step durations.

    Adds up ``result["duration_seconds"]`` over every entry of
    ``step_states``; steps without a result or without a duration count as 0.

    Returns:
        A ``MetricResult`` named ``total_duration_seconds`` holding the sum
        rounded to two decimals; it always passes.
    """
    # ``or`` fallbacks (rather than ``.get`` defaults) so that explicit nulls
    # in the trace document do not raise a TypeError inside ``sum``.
    duration = sum(
        (s.get("result") or {}).get("duration_seconds") or 0
        for s in (trace.get("step_states") or {}).values()
    )
    return MetricResult("total_duration_seconds", round(duration, 2), True)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def run_status(trace: dict[str, Any]) -> MetricResult:
    """Report the run's ``status`` (``"unknown"`` when absent).

    The metric passes only when the status equals ``"completed"``.
    """
    state = trace.get("status", "unknown")
    succeeded = state == "completed"
    return MetricResult("run_status", state, succeeded)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Registry of all built-in metrics
|
|
97
|
+
# Lookup table from metric key to the function computing it. Insertion order
# is the order in which ``evaluate_all`` runs the metrics by default. Note the
# "total_duration" key maps to a result named "total_duration_seconds".
BUILTIN_METRICS = {
    # Counters
    "total_turns": total_turns,
    "total_steps": total_steps,
    "completed_steps": completed_steps,
    "failed_steps": failed_steps,
    # Derived values
    "success_rate": success_rate,
    "total_duration": total_duration,
    # Overall outcome
    "run_status": run_status,
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def evaluate_all(
    trace: dict[str, Any],
    metric_names: list[str] | None = None,
) -> list[MetricResult]:
    """Run all (or selected) metrics against a trace.

    Args:
        trace: Trace document handed to every metric function.
        metric_names: Keys of ``BUILTIN_METRICS`` to run, in that order.
            ``None`` or an empty list runs every built-in metric.

    Returns:
        One ``MetricResult`` per recognised metric name. Unknown names are
        skipped and reported through the module logger.
    """
    selected = metric_names or list(BUILTIN_METRICS)
    results = []
    for name in selected:
        fn = BUILTIN_METRICS.get(name)
        if fn is None:
            # Surface typos instead of silently returning fewer results.
            logger.warning(
                "metric.unknown",
                metric=name,
                available=sorted(BUILTIN_METRICS),
            )
            continue
        results.append(fn(trace))
    return results
|