fred-deepeval-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fred_deepeval_cli-0.1.0/PKG-INFO +70 -0
- fred_deepeval_cli-0.1.0/README.md +46 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/__init__.py +0 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/cli/__init__.py +0 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/cli/display.py +208 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/cli/main.py +100 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/__init__.py +0 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/config_loader.py +62 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/evaluator.py +94 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/judge_factory.py +52 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/models.py +41 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/profiles.py +21 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/scorer.py +74 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/structural_checks.py +55 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/dataset_workflow.py +115 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/test_helpers.py +44 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli/worker_adapter.py +22 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/PKG-INFO +70 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/SOURCES.txt +33 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/dependency_links.txt +1 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/entry_points.txt +2 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/requires.txt +18 -0
- fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/top_level.txt +1 -0
- fred_deepeval_cli-0.1.0/pyproject.toml +45 -0
- fred_deepeval_cli-0.1.0/setup.cfg +4 -0
- fred_deepeval_cli-0.1.0/tests/test_classify.py +33 -0
- fred_deepeval_cli-0.1.0/tests/test_cli_parser.py +38 -0
- fred_deepeval_cli-0.1.0/tests/test_deepeval_adapter.py +16 -0
- fred_deepeval_cli-0.1.0/tests/test_deepeval_runner.py +65 -0
- fred_deepeval_cli-0.1.0/tests/test_eval_client.py +26 -0
- fred_deepeval_cli-0.1.0/tests/test_main.py +86 -0
- fred_deepeval_cli-0.1.0/tests/test_preset_resolver.py +23 -0
- fred_deepeval_cli-0.1.0/tests/test_run_sql_scenarios.py +8 -0
- fred_deepeval_cli-0.1.0/tests/test_sql_structural_checks.py +60 -0
- fred_deepeval_cli-0.1.0/tests/test_structural_checks.py +42 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fred-deepeval-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: External CLI for evaluating Fred agent turns via /agents/evaluate
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Requires-Python: <3.13,>=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: fred-sdk>=2.0.7
|
|
9
|
+
Requires-Dist: fred-runtime>=2.0.8
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: bandit>=1.8.6; extra == "dev"
|
|
12
|
+
Requires-Dist: basedpyright==1.31.0; extra == "dev"
|
|
13
|
+
Requires-Dist: detect-secrets>=1.5.0; extra == "dev"
|
|
14
|
+
Requires-Dist: pytest>=8.4.2; extra == "dev"
|
|
15
|
+
Requires-Dist: pytest-cov>=6.2.1; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-socket>=0.7.0; extra == "dev"
|
|
17
|
+
Requires-Dist: ruff>=0.12.5; extra == "dev"
|
|
18
|
+
Provides-Extra: eval
|
|
19
|
+
Requires-Dist: deepeval; extra == "eval"
|
|
20
|
+
Requires-Dist: litellm; extra == "eval"
|
|
21
|
+
Requires-Dist: python-dotenv; extra == "eval"
|
|
22
|
+
Requires-Dist: rich>=13.0; extra == "eval"
|
|
23
|
+
Requires-Dist: temporalio; extra == "eval"
|
|
24
|
+
|
|
25
|
+
# fred-deepeval-cli
|
|
26
|
+
|
|
27
|
+
External CLI for evaluating one Fred agent turn through `POST /agents/evaluate`.
|
|
28
|
+
|
|
29
|
+
## Purpose
|
|
30
|
+
|
|
31
|
+
This project provides a small external CLI that:
|
|
32
|
+
- calls a Fred pod `/agents/evaluate` endpoint
|
|
33
|
+
- receives an `EvalTrace`
|
|
34
|
+
- classifies the turn outcome
|
|
35
|
+
- resolves an evaluation preset from `agent_tags`
|
|
36
|
+
- computes structural checks
|
|
37
|
+
- scores the trace with DeepEval
|
|
38
|
+
|
|
39
|
+
## Commands
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
make dev
|
|
43
|
+
make eval-dev
|
|
44
|
+
make test
|
|
45
|
+
make code-quality
|
|
46
|
+
make cli
|
|
47
|
+
make score BASE_URL=http://127.0.0.1:8000/fred/agents/v2 AGENT_ID=fred.test.assistant INPUT="echo bonjour" SESSION_ID=eval-001 USER_ID=alice
|
|
48
|
+
make sql-scenarios BASE_URL=http://127.0.0.1:8000/fred/agents/v2
```
|
|
49
|
+
|
|
50
|
+
## Documentation
|
|
51
|
+
|
|
52
|
+
| Topic | File |
|
|
53
|
+
| --- | --- |
|
|
54
|
+
| Evaluate any Fred agent pod | `docs/evaluating-any-fred-agent.md` |
|
|
55
|
+
| RAG evaluation — approach and metrics | `docs/rag-evaluation-rfc.md` |
|
|
56
|
+
| RAG local setup guide | `docs/rag-local-setup.md` |
|
|
57
|
+
| SQL evaluation | `docs/sql-evaluation.md` |
|
|
58
|
+
| OTel export strategy | `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §13` |
|
|
59
|
+
|
|
60
|
+
## Architecture — EVAL-01 Phase 1
|
|
61
|
+
|
|
62
|
+
This CLI is being restructured into a reusable library core so the Fred
|
|
63
|
+
Control Plane evaluation worker can call it directly without spawning a subprocess.
|
|
64
|
+
|
|
65
|
+
- `fred_deepeval_cli/core/` — callable library (models, evaluator, profiles, scorer, judge factory)
|
|
66
|
+
- `fred_deepeval_cli/cli/` — thin CLI adapter over the core
|
|
67
|
+
- `fred_deepeval_cli/worker_adapter.py` — public entrypoint for the Control Plane worker
|
|
68
|
+
|
|
69
|
+
The CLI interface and JSON output remain unchanged.
|
|
70
|
+
See EVAL-01 Phase 1 issue and `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §7.3`.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# fred-deepeval-cli
|
|
2
|
+
|
|
3
|
+
External CLI for evaluating one Fred agent turn through `POST /agents/evaluate`.
|
|
4
|
+
|
|
5
|
+
## Purpose
|
|
6
|
+
|
|
7
|
+
This project provides a small external CLI that:
|
|
8
|
+
- calls a Fred pod `/agents/evaluate` endpoint
|
|
9
|
+
- receives an `EvalTrace`
|
|
10
|
+
- classifies the turn outcome
|
|
11
|
+
- resolves an evaluation preset from `agent_tags`
|
|
12
|
+
- computes structural checks
|
|
13
|
+
- scores the trace with DeepEval
|
|
14
|
+
|
|
15
|
+
## Commands
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
make dev
|
|
19
|
+
make eval-dev
|
|
20
|
+
make test
|
|
21
|
+
make code-quality
|
|
22
|
+
make cli
|
|
23
|
+
make score BASE_URL=http://127.0.0.1:8000/fred/agents/v2 AGENT_ID=fred.test.assistant INPUT="echo bonjour" SESSION_ID=eval-001 USER_ID=alice
|
|
24
|
+
make sql-scenarios BASE_URL=http://127.0.0.1:8000/fred/agents/v2
```
|
|
25
|
+
|
|
26
|
+
## Documentation
|
|
27
|
+
|
|
28
|
+
| Topic | File |
|
|
29
|
+
| --- | --- |
|
|
30
|
+
| Evaluate any Fred agent pod | `docs/evaluating-any-fred-agent.md` |
|
|
31
|
+
| RAG evaluation — approach and metrics | `docs/rag-evaluation-rfc.md` |
|
|
32
|
+
| RAG local setup guide | `docs/rag-local-setup.md` |
|
|
33
|
+
| SQL evaluation | `docs/sql-evaluation.md` |
|
|
34
|
+
| OTel export strategy | `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §13` |
|
|
35
|
+
|
|
36
|
+
## Architecture — EVAL-01 Phase 1
|
|
37
|
+
|
|
38
|
+
This CLI is being restructured into a reusable library core so the Fred
|
|
39
|
+
Control Plane evaluation worker can call it directly without spawning a subprocess.
|
|
40
|
+
|
|
41
|
+
- `fred_deepeval_cli/core/` — callable library (models, evaluator, profiles, scorer, judge factory)
|
|
42
|
+
- `fred_deepeval_cli/cli/` — thin CLI adapter over the core
|
|
43
|
+
- `fred_deepeval_cli/worker_adapter.py` — public entrypoint for the Control Plane worker
|
|
44
|
+
|
|
45
|
+
The CLI interface and JSON output remain unchanged.
|
|
46
|
+
See EVAL-01 Phase 1 issue and `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §7.3`.
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
from rich.panel import Panel
|
|
5
|
+
from rich.table import Table
|
|
6
|
+
from rich import box
|
|
7
|
+
from rich.text import Text
|
|
8
|
+
|
|
9
|
+
from fred_deepeval_cli.core.models import EvaluationCaseRequest, EvaluationCaseResult
|
|
10
|
+
|
|
11
|
+
# Shared Rich console for this module; stderr=True sends rendered panels to stderr.
console = Console(stderr=True)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _check_icon(value: object) -> str:
    """Return the pass icon only when *value* is exactly ``True``, else the fail icon."""
    if value is True:
        return "✅"
    return "❌"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _outcome_text(outcome: str) -> Text:
    """Build the styled outcome label.

    ``execution_error`` is rendered bold red; every other outcome is
    rendered bold green.
    """
    label_style = "bold red" if outcome == "execution_error" else "bold green"
    return Text(f" {outcome}", style=label_style)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def render_score(
    result: EvaluationCaseResult,
    request: EvaluationCaseRequest | None = None,
) -> None:
    """Render one evaluation result as a sequence of Rich panels on ``console``.

    Args:
        result: Evaluation result to display.
        request: Optional originating request; when given, its agent id,
            session id and input are shown in the header.

    Free-form values (request fields, profile, agent output, check/metric
    names, explanations, scoring errors) are escaped before being handed to
    Rich. Rich parses ``[...]`` in plain strings as console markup, so an
    unescaped ``[rag]`` in the panel title would be consumed as a style tag,
    and a stray ``[/x]`` in agent output would raise ``MarkupError``.
    """
    # Local import: only this function needs it, and rich is already a
    # dependency of this module.
    from rich.markup import escape

    # ── Header ──────────────────────────────────────────────────────────────
    header = Table.grid(padding=(0, 2))
    header.add_column(style="bold cyan")
    header.add_column()
    if request:
        header.add_row("Agent", escape(request.agent_id))
        header.add_row("Session", escape(request.session_id))
        header.add_row("Input", escape(request.input))
    header.add_row("Profile", escape(result.profile))

    console.print()
    console.print(Panel(header, title="[bold]fred-deepeval-cli[/bold]", border_style="cyan"))

    # ── Agent output ────────────────────────────────────────────────────────
    agent_output = result.actual_output or "—"
    console.print(Panel(escape(agent_output), title="Output", border_style="yellow"))

    # ── Outcome ─────────────────────────────────────────────────────────────
    console.print(Panel(
        _outcome_text(result.outcome),
        title="Outcome",
        border_style="green" if result.outcome != "execution_error" else "red",
    ))

    # ── Structural Checks ───────────────────────────────────────────────────
    if result.structural_checks:
        table = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
        table.add_column("Check", style="cyan")
        table.add_column("", justify="center")

        for check in result.structural_checks:
            table.add_row(escape(check.name), _check_icon(check.passed))

        # Escape the whole title so the literal "[<profile>]" suffix is shown.
        checks_title = escape(f"Structural Checks [{result.profile}]")
        console.print(Panel(table, title=checks_title, border_style="magenta"))

    # ── DeepEval Metrics ────────────────────────────────────────────────────
    if result.metrics:
        table = Table(box=box.SIMPLE, show_header=True, header_style="bold blue")
        table.add_column("Metric", style="cyan")
        table.add_column("Score", justify="right")
        table.add_column("", justify="center")
        table.add_column("Reason", style="dim", no_wrap=False, max_width=60)

        for m in result.metrics:
            score_str = f"{m.score:.2f}" if isinstance(m.score, float) else "—"
            icon = "✅" if m.verdict == "passed" else ("⏭" if m.verdict == "skipped" else "❌")
            reason = m.explanation or m.error or "—"
            table.add_row(escape(m.name), score_str, icon, escape(reason))

        console.print(Panel(table, title="DeepEval Metrics", border_style="blue"))

    # ── Errors ──────────────────────────────────────────────────────────────
    if result.scoring_errors:
        console.print(Panel(
            escape("\n".join(result.scoring_errors)),
            title="Scoring Errors",
            border_style="red",
        ))

    console.print()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ── Campagne ────────────────────────────────────────────────────────────────
|
|
90
|
+
|
|
91
|
+
# Metric names shown in the RAG campaign tables, in column order.
_CAMPAIGN_METRICS = [
    "AnswerRelevancyMetric",
    "FaithfulnessMetric",
    "ContextualRelevancyMetric",
    "ContextualPrecisionMetric",
    "ContextualRecallMetric",
]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _fmt_score(metrics_by_name: dict, name: str, totals: dict) -> str:
    """Format one metric cell and record its score for later averaging.

    Args:
        metrics_by_name: Mapping of metric name to a metric mapping that may
            carry ``score`` and ``verdict`` keys.
        name: Metric to look up.
        totals: Mapping of metric name to the list of scores collected so
            far; mutated in place when a score is found.

    Returns:
        ``"—"`` when the metric or its score is missing; otherwise the score
        with two decimals followed by a pass/fail icon.
    """
    metric = metrics_by_name.get(name)
    score = None if metric is None else metric.get("score")
    if score is None:
        return "—"

    # setdefault avoids a KeyError when the caller did not pre-register `name`.
    totals.setdefault(name, []).append(score)
    icon = "✅" if metric.get("verdict") == "passed" else "❌"
    return f"{score:.2f}{icon}"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def render_campaign(results: list[dict]) -> None:
    """Print the summary tables for a RAG campaign.

    Two panels are printed on ``console``: one row per scenario with its
    outcome, RAG flag and one cell per campaign metric, then the per-metric
    averages with an ``OVERALL`` row (the mean of the per-metric means).

    Args:
        results: One mapping per scenario. ``id`` and ``outcome`` are
            required; ``rag_ok`` and ``metrics`` are optional. ``metrics``
            may be a name-keyed mapping or a list of mappings with a
            ``name`` key.
    """
    totals: dict[str, list[float]] = {metric: [] for metric in _CAMPAIGN_METRICS}

    table = Table(box=box.SIMPLE, show_header=True, header_style="bold cyan")
    table.add_column("ID", style="dim", width=22)
    table.add_column("Outcome", width=10)
    table.add_column("RAG", justify="center", width=5)
    score_columns = (
        ("AnswerRel", 10),
        ("Faithful", 10),
        ("CtxRel", 8),
        ("CtxPrec", 9),
        ("CtxRecall", 10),
    )
    for heading, width in score_columns:
        table.add_column(heading, justify="right", width=width)

    for r in results:
        raw_metrics = r.get("metrics", {})
        if isinstance(raw_metrics, dict):
            metrics_by_name = raw_metrics
        else:
            # List form: index the entries by their "name" key.
            metrics_by_name = {entry["name"]: entry for entry in raw_metrics}

        # Score cells follow _CAMPAIGN_METRICS order, matching the columns above.
        table.add_row(
            r["id"],
            r["outcome"],
            "✅" if r.get("rag_ok") else "❌",
            *(_fmt_score(metrics_by_name, metric, totals) for metric in _CAMPAIGN_METRICS),
        )

    console.print()
    console.print(Panel(table, title="Résultats par scénario", border_style="cyan"))

    # ── Averages ─────────────────────────────────────────────────────────────
    avg_table = Table(box=box.SIMPLE, show_header=True, header_style="bold blue")
    avg_table.add_column("Métrique", style="cyan")
    avg_table.add_column("Moyenne", justify="right")
    avg_table.add_column("N", justify="right", style="dim")

    per_metric_means: list[float] = []
    for name in _CAMPAIGN_METRICS:
        scores = totals[name]
        if not scores:
            avg_table.add_row(name, "—", "0")
            continue
        mean = sum(scores) / len(scores)
        per_metric_means.append(mean)
        avg_table.add_row(name, f"{mean:.4f} ({mean * 100:.1f}%)", str(len(scores)))

    if per_metric_means:
        global_avg = sum(per_metric_means) / len(per_metric_means)
        avg_table.add_row(
            "OVERALL",
            f"{global_avg:.4f} ({global_avg * 100:.1f}%)",
            "",
            style="bold",
        )

    console.print(Panel(avg_table, title="Moyennes par métrique", border_style="blue"))
    console.print()
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def render_sql_campaign(results: list[dict]) -> None:
    """Print the summary table for a SQL campaign, then a pass-count panel.

    Args:
        results: One mapping per scenario. ``id`` and ``outcome`` are
            required; ``observed_checks``, ``failures`` and ``pass`` are
            optional.
    """
    table = Table(box=box.SIMPLE, show_header=True, header_style="bold cyan")
    table.add_column("ID", style="dim", width=22)
    table.add_column("Outcome", width=12)
    for heading, width in (("Query exec", 12), ("No error", 10), ("Pass", 6)):
        table.add_column(heading, justify="center", width=width)
    table.add_column("Failures", style="dim", no_wrap=False, max_width=50)

    passed = 0
    for r in results:
        checks = r.get("observed_checks", {})
        failures = r.get("failures", [])
        is_pass = r.get("pass", False)
        if is_pass:
            passed += 1

        failure_cell = " | ".join(failures) if failures else "—"
        table.add_row(
            r["id"],
            r["outcome"],
            "✅" if checks.get("sql_query_executed") else "❌",
            "✅" if checks.get("sql_no_execution_error") else "❌",
            "✅" if is_pass else "❌",
            failure_cell,
        )

    console.print()
    console.print(Panel(table, title="Résultats SQL par scénario", border_style="cyan"))

    total = len(results)
    # Green when every scenario passed, yellow when only some did, red when none did.
    if passed == total:
        color = "green"
    elif passed > 0:
        color = "yellow"
    else:
        color = "red"
    console.print(Panel(
        f"[bold {color}]{passed}/{total} scénarios passés[/bold {color}]",
        border_style=color,
    ))
    console.print()
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from fred_deepeval_cli.core.models import EvaluationCaseRequest
|
|
8
|
+
from fred_deepeval_cli.core.evaluator import evaluate_case_sync
|
|
9
|
+
from fred_deepeval_cli.core.judge_factory import build_judge
|
|
10
|
+
from fred_deepeval_cli.cli.display import render_score
|
|
11
|
+
from dotenv import load_dotenv
|
|
12
|
+
|
|
13
|
+
# Import-time side effect: load the dotenv file named by ENV_FILE
# (falling back to ./config/.env).
dotenv_path = os.getenv("ENV_FILE", "./config/.env")
load_dotenv(dotenv_path)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the top-level CLI parser with its required ``score`` sub-command."""
    root = argparse.ArgumentParser(
        prog="fred-deepeval-cli",
        description="External CLI for evaluating Fred agent turns.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # The only sub-command registered here; its flags come from add_shared_eval_args.
    add_shared_eval_args(
        commands.add_parser(
            "score",
            help="Evaluate one Fred agent turn and score it with DeepEval.",
        )
    )
    return root
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def add_shared_eval_args(parser: argparse.ArgumentParser) -> None:
    """Register the evaluation flags on *parser*.

    Five flags are required. ``--access-token`` and ``--search-policy`` take
    their defaults from ``FRED_ACCESS_TOKEN`` / ``FRED_SEARCH_POLICY``, read
    when this function runs. ``--profile`` defaults to ``auto``.
    """
    required_flags = (
        ("--base-url", "Fred pod base URL."),
        ("--agent-id", "Agent identifier."),
        ("--input", "User input to evaluate."),
        ("--session-id", "Session identifier."),
        ("--user-id", "Runtime user identifier."),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    parser.add_argument("--team-id", help="Optional runtime team identifier.")

    env_token = os.environ.get("FRED_ACCESS_TOKEN")
    parser.add_argument(
        "--access-token",
        default=env_token,
        help="Optional bearer token for authenticated agent evaluation.",
    )

    env_policy = os.environ.get("FRED_SEARCH_POLICY")
    parser.add_argument(
        "--search-policy",
        default=env_policy,
        help="Optional runtime search policy override (for example: semantic).",
    )

    parser.add_argument(
        "--profile",
        default="auto",
        choices=["auto", "rag", "sql", "workflow", "default"],
        help="Evaluation profile. Defaults to auto-detection from agent_tags.",
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def run_score(args: argparse.Namespace) -> int:
    """Run the ``score`` command and return its exit code.

    Builds the request from parsed arguments, evaluates it with the
    configured judge, renders the result via ``render_score`` and prints it
    as JSON with ``print``.

    Returns:
        ``1`` when the outcome is ``execution_error``, otherwise ``0``.
    """
    runtime_context: dict = {"user_id": args.user_id}
    # Optional overrides are included only when they have a truthy value.
    for key in ("team_id", "search_policy"):
        value = getattr(args, key)
        if value:
            runtime_context[key] = value

    request = EvaluationCaseRequest(
        agent_id=args.agent_id,
        input=args.input,
        session_id=args.session_id,
        profile=args.profile,
        runtime_context=runtime_context,
    )

    result = evaluate_case_sync(
        base_url=args.base_url,
        request=request,
        judge=build_judge(),
        access_token=args.access_token,
    )

    render_score(result, request=request)
    print(json.dumps(result.model_dump(), indent=2, ensure_ascii=False))

    if result.outcome == "execution_error":
        return 1
    return 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def main() -> int:
    """Parse the command line and dispatch to the selected command.

    Returns:
        The exit code of the command that ran.
    """
    cli = build_parser()
    args = cli.parse_args()

    if args.command != "score":
        cli.error(f"Unknown command: {args.command}")
        return 2

    return run_score(args)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Script entry point: exit with the code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
|
File without changes
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from fred_core.common import ConfigFiles, load_configuration_with_config_files
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class JudgeProfileSettings(BaseModel):
    """Optional connection settings attached to a judge profile."""

    # Base URL of the judge endpoint; None leaves it unset.
    api_base: str | None = None
    # Name of the environment variable that holds the API key.
    api_key_env: str | None = None
    # Request timeout (presumably seconds — TODO confirm against the judge client).
    request_timeout: int = 120
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class JudgeProfile(BaseModel):
    """One named judge definition: provider, model and connection settings."""

    # Identifier matched against JudgeConfig.default.
    profile_id: str
    provider: str
    model: str
    # Defaults to an all-defaults settings object when omitted.
    settings: JudgeProfileSettings = JudgeProfileSettings()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class JudgeConfig(BaseModel):
    """Set of judge profiles plus the id of the one to use."""

    # profile_id of the profile returned by active().
    default: str
    profiles: list[JudgeProfile]

    def active(self) -> JudgeProfile:
        """Return the first profile whose ``profile_id`` equals ``default``.

        Raises:
            ValueError: If no profile has that id; the message lists the
                available ids.
        """
        selected = next(
            (candidate for candidate in self.profiles if candidate.profile_id == self.default),
            None,
        )
        if selected is None:
            raise ValueError(
                f"Judge profile '{self.default}' not found. "
                f"Available: {[p.profile_id for p in self.profiles]}"
            )
        return selected
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Configuration(BaseModel):
    """Root of the configuration file: a version tag and the judge section."""

    version: str = "v1"
    judge: JudgeConfig
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_configuration(config_file: str) -> Configuration:
    """Read a YAML file and validate it as a ``Configuration``.

    Args:
        config_file: Path of the YAML file, opened as UTF-8.

    Raises:
        ValueError: If the file is empty or its top level is not a mapping.
    """
    import yaml

    with open(config_file, encoding="utf-8") as stream:
        document = yaml.safe_load(stream)

    if document is None:
        raise ValueError(f"Configuration file is empty: {config_file}")
    if isinstance(document, dict):
        return Configuration.model_validate(document)
    raise ValueError(f"Configuration file must be a mapping object: {config_file}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Module-level ConfigFiles instance (from fred_core.common), built with this
# module's logger and handed to load_configuration_with_config_files below.
_config_files = ConfigFiles(logger=logging.getLogger(__name__))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_configuration() -> Configuration:
    """Load the configuration through fred_core's helper.

    Passes the module-level ``_config_files`` and ``parse_configuration`` to
    ``load_configuration_with_config_files`` and returns its result.
    """
    configuration = load_configuration_with_config_files(_config_files, parse_configuration)
    return configuration
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
|
|
5
|
+
from fred_deepeval_cli.core.models import (
|
|
6
|
+
EvaluationCaseRequest,
|
|
7
|
+
EvaluationCaseResult,
|
|
8
|
+
)
|
|
9
|
+
from fred_deepeval_cli.core.profiles import resolve_profile
|
|
10
|
+
from fred_deepeval_cli.core.structural_checks import build_structural_checks
|
|
11
|
+
from fred_deepeval_cli.core.scorer import score_trace
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def classify_outcome(trace: dict) -> str:
    """Classify an evaluation trace into one outcome label.

    Precedence, highest first:

    1. ``execution_error`` — the trace has a truthy ``error``.
    2. ``hitl_blocked`` — some step has ``kind == "awaiting_human"``.
    3. ``degraded`` — some step has ``kind == "node_error"``.
    4. ``success`` — the trace has a truthy ``output``.
    5. ``unknown`` — none of the above.

    Args:
        trace: Trace mapping. ``steps`` may be missing or ``None``; entries
            that are not mappings are ignored.

    Returns:
        One of the five labels above.
    """
    if trace.get("error"):
        return "execution_error"

    # `or []` covers an explicit `"steps": null`; collecting the kinds once
    # avoids scanning the steps twice.
    step_kinds = [
        step.get("kind")
        for step in (trace.get("steps") or [])
        if isinstance(step, dict)
    ]
    if "awaiting_human" in step_kinds:
        return "hitl_blocked"
    if "node_error" in step_kinds:
        return "degraded"

    return "success" if trace.get("output") else "unknown"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def fetch_trace(
    base_url: str,
    request: EvaluationCaseRequest,
    access_token: str | None = None,
) -> dict:
    """POST the request to ``<base_url>/agents/evaluate`` and return the JSON body.

    Args:
        base_url: Base URL; trailing slashes are stripped before the path is
            appended.
        request: Supplies ``agent_id``, ``input``, ``session_id`` and
            ``runtime_context`` for the payload.
        access_token: When truthy, sent as a ``Bearer`` Authorization header.

    Raises:
        RuntimeError: If the decoded response body is not a JSON object.
        Exception: Anything httpx raises, including from ``raise_for_status()``.
    """
    request_headers = {"Content-Type": "application/json"}
    if access_token:
        request_headers["Authorization"] = f"Bearer {access_token}"

    body = {
        "agent_id": request.agent_id,
        "input": request.input,
        "session_id": request.session_id,
        "runtime_context": request.runtime_context,
    }

    # 30 s default and 5 s connect timeout; read=None disables the read timeout.
    timeout = httpx.Timeout(30.0, connect=5.0, read=None)
    with httpx.Client(timeout=timeout) as client:
        response = client.post(
            f"{base_url.rstrip('/')}/agents/evaluate",
            json=body,
            headers=request_headers,
        )
        response.raise_for_status()
        decoded = response.json()
        if isinstance(decoded, dict):
            return decoded
        raise RuntimeError("Evaluate response must be a JSON object.")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def evaluate_case_sync(
    base_url: str,
    request: EvaluationCaseRequest,
    judge=None,
    access_token: str | None = None,
) -> EvaluationCaseResult:
    """Fetch a trace for *request*, then classify, check and optionally score it.

    Args:
        base_url: Base URL passed to ``fetch_trace``.
        request: Evaluation case to run.
        judge: Judge passed to ``score_trace``; scoring is skipped when ``None``.
        access_token: Optional bearer token passed to ``fetch_trace``.

    Returns:
        The evaluation result. If ``fetch_trace`` raises, an
        ``execution_error`` result carrying ``str(exception)`` is returned
        instead of propagating the exception.
    """
    try:
        trace = fetch_trace(base_url, request, access_token=access_token)
    except Exception as exc:
        # Fetch failures are reported as a result so callers always get one back.
        return EvaluationCaseResult(
            outcome="execution_error",
            profile=request.profile,
            structural_checks=[],
            metrics=[],
            execution_error=str(exc),
        )

    outcome = classify_outcome(trace)
    resolved_profile = resolve_profile(trace, explicit_profile=request.profile)
    checks = build_structural_checks(trace, profile=resolved_profile)

    if judge is None:
        metrics, scoring_errors = [], []
    else:
        metrics, scoring_errors = score_trace(
            trace,
            profile=resolved_profile,
            expected_output=request.expected_output,
            judge=judge,
        )

    return EvaluationCaseResult(
        outcome=outcome,
        profile=resolved_profile,
        structural_checks=checks,
        metrics=metrics,
        actual_output=trace.get("output"),
        latency_ms=trace.get("latency_ms"),
        execution_error=trace.get("error"),
        scoring_errors=scoring_errors,
    )
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from fred_deepeval_cli.core.config_loader import load_configuration
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_judge(config=None):
    """Build the DeepEval judge model for the active judge profile.

    Args:
        config: Configuration object; loaded with ``load_configuration()``
            when ``None``.

    Returns:
        A ``LiteLLMModel`` for the ``litellm`` and ``ollama`` providers, a
        ``GPTModel`` for ``openai``.

    Raises:
        RuntimeError: If the API-key environment variable required by the
            ``litellm`` or ``openai`` provider is missing or empty.
        ValueError: If the provider is not one of the three supported ones.
    """
    from deepeval.models.llms import GPTModel, LiteLLMModel

    if config is None:
        config = load_configuration()

    active = config.judge.active()
    provider, model_name, settings = active.provider, active.model, active.settings

    def _require_key(default_env: str) -> str:
        # Read the key from the configured env var, or the provider default.
        env_name = settings.api_key_env or default_env
        value = os.environ.get(env_name)
        if not value:
            raise RuntimeError(
                f"Missing {env_name} in environment/.env for the {provider} judge."
            )
        return value

    if provider == "litellm":
        return LiteLLMModel(
            model=model_name,
            api_key=_require_key("LITELLM_API_KEY"),
            base_url=settings.api_base,
            request_timeout=settings.request_timeout,
            num_retries=0,
        )

    if provider == "ollama":
        # No real key is read for ollama; a fixed placeholder string is passed.
        return LiteLLMModel(
            model=f"ollama/{model_name}",
            api_key="ollama",
            base_url=settings.api_base or "http://localhost:11434",
            request_timeout=settings.request_timeout,
            num_retries=0,
        )

    if provider == "openai":
        # NOTE(review): the key is only checked for presence and is not passed
        # to GPTModel, and api_base/request_timeout are ignored here. A custom
        # api_key_env presumably has no effect beyond this check — confirm
        # against the GPTModel constructor.
        _require_key("OPENAI_API_KEY")
        return GPTModel(model=model_name)

    raise ValueError(f"Unsupported judge provider: {provider}")
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EvaluationMetricResult(BaseModel):
    """Result of one scored metric."""

    name: str
    provider: str
    # Required field, but None is an accepted value.
    score: float | None
    threshold: float | None = None
    verdict: Literal["passed", "failed", "skipped", "error"]
    explanation: str | None = None
    error: str | None = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StructuralCheckResult(BaseModel):
    """Result of one structural check: its name, pass flag and optional detail."""

    name: str
    passed: bool
    detail: str | None = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class EvaluationCaseRequest(BaseModel):
    """Input of one evaluation case."""

    agent_id: str
    input: str
    session_id: str
    expected_output: str | None = None
    # "auto" is the default profile value.
    profile: str = "auto"
    runtime_context: dict = {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class EvaluationCaseResult(BaseModel):
    """Outcome of one evaluation case: classification, checks, metrics and errors."""

    # Fixed schema tag; "1" is the only accepted value.
    schema_version: Literal["1"] = "1"
    outcome: Literal["success", "execution_error", "degraded", "hitl_blocked", "unknown"]
    profile: str
    structural_checks: list[StructuralCheckResult]
    metrics: list[EvaluationMetricResult]
    latency_ms: int | None = None
    actual_output: str | None = None
    execution_error: str | None = None
    scoring_errors: list[str] = []
|