fred-deepeval-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. fred_deepeval_cli-0.1.0/PKG-INFO +70 -0
  2. fred_deepeval_cli-0.1.0/README.md +46 -0
  3. fred_deepeval_cli-0.1.0/fred_deepeval_cli/__init__.py +0 -0
  4. fred_deepeval_cli-0.1.0/fred_deepeval_cli/cli/__init__.py +0 -0
  5. fred_deepeval_cli-0.1.0/fred_deepeval_cli/cli/display.py +208 -0
  6. fred_deepeval_cli-0.1.0/fred_deepeval_cli/cli/main.py +100 -0
  7. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/__init__.py +0 -0
  8. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/config_loader.py +62 -0
  9. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/evaluator.py +94 -0
  10. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/judge_factory.py +52 -0
  11. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/models.py +41 -0
  12. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/profiles.py +21 -0
  13. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/scorer.py +74 -0
  14. fred_deepeval_cli-0.1.0/fred_deepeval_cli/core/structural_checks.py +55 -0
  15. fred_deepeval_cli-0.1.0/fred_deepeval_cli/dataset_workflow.py +115 -0
  16. fred_deepeval_cli-0.1.0/fred_deepeval_cli/test_helpers.py +44 -0
  17. fred_deepeval_cli-0.1.0/fred_deepeval_cli/worker_adapter.py +22 -0
  18. fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/PKG-INFO +70 -0
  19. fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/SOURCES.txt +33 -0
  20. fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/dependency_links.txt +1 -0
  21. fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/entry_points.txt +2 -0
  22. fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/requires.txt +18 -0
  23. fred_deepeval_cli-0.1.0/fred_deepeval_cli.egg-info/top_level.txt +1 -0
  24. fred_deepeval_cli-0.1.0/pyproject.toml +45 -0
  25. fred_deepeval_cli-0.1.0/setup.cfg +4 -0
  26. fred_deepeval_cli-0.1.0/tests/test_classify.py +33 -0
  27. fred_deepeval_cli-0.1.0/tests/test_cli_parser.py +38 -0
  28. fred_deepeval_cli-0.1.0/tests/test_deepeval_adapter.py +16 -0
  29. fred_deepeval_cli-0.1.0/tests/test_deepeval_runner.py +65 -0
  30. fred_deepeval_cli-0.1.0/tests/test_eval_client.py +26 -0
  31. fred_deepeval_cli-0.1.0/tests/test_main.py +86 -0
  32. fred_deepeval_cli-0.1.0/tests/test_preset_resolver.py +23 -0
  33. fred_deepeval_cli-0.1.0/tests/test_run_sql_scenarios.py +8 -0
  34. fred_deepeval_cli-0.1.0/tests/test_sql_structural_checks.py +60 -0
  35. fred_deepeval_cli-0.1.0/tests/test_structural_checks.py +42 -0
@@ -0,0 +1,70 @@
1
+ Metadata-Version: 2.4
2
+ Name: fred-deepeval-cli
3
+ Version: 0.1.0
4
+ Summary: External CLI for evaluating Fred agent turns via /agents/evaluate
5
+ License: Apache-2.0
6
+ Requires-Python: <3.13,>=3.12
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: fred-sdk>=2.0.7
9
+ Requires-Dist: fred-runtime>=2.0.8
10
+ Provides-Extra: dev
11
+ Requires-Dist: bandit>=1.8.6; extra == "dev"
12
+ Requires-Dist: basedpyright==1.31.0; extra == "dev"
13
+ Requires-Dist: detect-secrets>=1.5.0; extra == "dev"
14
+ Requires-Dist: pytest>=8.4.2; extra == "dev"
15
+ Requires-Dist: pytest-cov>=6.2.1; extra == "dev"
16
+ Requires-Dist: pytest-socket>=0.7.0; extra == "dev"
17
+ Requires-Dist: ruff>=0.12.5; extra == "dev"
18
+ Provides-Extra: eval
19
+ Requires-Dist: deepeval; extra == "eval"
20
+ Requires-Dist: litellm; extra == "eval"
21
+ Requires-Dist: python-dotenv; extra == "eval"
22
+ Requires-Dist: rich>=13.0; extra == "eval"
23
+ Requires-Dist: temporalio; extra == "eval"
24
+
25
+ # fred-deepeval-cli
26
+
27
+ External CLI for evaluating one Fred agent turn through `POST /agents/evaluate`.
28
+
29
+ ## Purpose
30
+
31
+ This project provides a small external CLI that:
32
+ - calls a Fred pod `/agents/evaluate` endpoint
33
+ - receives an `EvalTrace`
34
+ - classifies the turn outcome
35
+ - resolves an evaluation preset from `agent_tags`
36
+ - computes structural checks
37
+ - scores the trace with DeepEval
38
+
39
+ ## Commands
40
+
41
+ ```bash
42
+ make dev
43
+ make eval-dev
44
+ make test
45
+ make code-quality
46
+ make cli
47
+ make score BASE_URL=http://127.0.0.1:8000/fred/agents/v2 AGENT_ID=fred.test.assistant INPUT="echo bonjour" SESSION_ID=eval-001 USER_ID=alice
48
+ make sql-scenarios BASE_URL=http://127.0.0.1:8000/fred/agents/v2
49
+ ```
50
+ ## Documentation
51
+
52
+ | Topic | File |
53
+ | --- | --- |
54
+ | Evaluate any Fred agent pod | `docs/evaluating-any-fred-agent.md` |
55
+ | RAG evaluation — approach and metrics | `docs/rag-evaluation-rfc.md` |
56
+ | RAG local setup guide | `docs/rag-local-setup.md` |
57
+ | SQL evaluation | `docs/sql-evaluation.md` |
58
+ | OTel export strategy | `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §13` |
59
+
60
+ ## Architecture — EVAL-01 Phase 1
61
+
62
+ This CLI is being restructured into a reusable library core so the Fred
63
+ Control Plane evaluation worker can call it directly without spawning a subprocess.
64
+
65
+ - `fred_deepeval_cli/core/` — callable library (models, evaluator, profiles, scorer, judge factory)
66
+ - `fred_deepeval_cli/cli/` — thin CLI adapter over the core
67
+ - `fred_deepeval_cli/worker_adapter.py` — public entrypoint for the Control Plane worker
68
+
69
+ The CLI interface and JSON output remain unchanged.
70
+ See EVAL-01 Phase 1 issue and `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §7.3`.
@@ -0,0 +1,46 @@
1
+ # fred-deepeval-cli
2
+
3
+ External CLI for evaluating one Fred agent turn through `POST /agents/evaluate`.
4
+
5
+ ## Purpose
6
+
7
+ This project provides a small external CLI that:
8
+ - calls a Fred pod `/agents/evaluate` endpoint
9
+ - receives an `EvalTrace`
10
+ - classifies the turn outcome
11
+ - resolves an evaluation preset from `agent_tags`
12
+ - computes structural checks
13
+ - scores the trace with DeepEval
14
+
15
+ ## Commands
16
+
17
+ ```bash
18
+ make dev
19
+ make eval-dev
20
+ make test
21
+ make code-quality
22
+ make cli
23
+ make score BASE_URL=http://127.0.0.1:8000/fred/agents/v2 AGENT_ID=fred.test.assistant INPUT="echo bonjour" SESSION_ID=eval-001 USER_ID=alice
24
+ make sql-scenarios BASE_URL=http://127.0.0.1:8000/fred/agents/v2
25
+ ```
26
+ ## Documentation
27
+
28
+ | Topic | File |
29
+ | --- | --- |
30
+ | Evaluate any Fred agent pod | `docs/evaluating-any-fred-agent.md` |
31
+ | RAG evaluation — approach and metrics | `docs/rag-evaluation-rfc.md` |
32
+ | RAG local setup guide | `docs/rag-local-setup.md` |
33
+ | SQL evaluation | `docs/sql-evaluation.md` |
34
+ | OTel export strategy | `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §13` |
35
+
36
+ ## Architecture — EVAL-01 Phase 1
37
+
38
+ This CLI is being restructured into a reusable library core so the Fred
39
+ Control Plane evaluation worker can call it directly without spawning a subprocess.
40
+
41
+ - `fred_deepeval_cli/core/` — callable library (models, evaluator, profiles, scorer, judge factory)
42
+ - `fred_deepeval_cli/cli/` — thin CLI adapter over the core
43
+ - `fred_deepeval_cli/worker_adapter.py` — public entrypoint for the Control Plane worker
44
+
45
+ The CLI interface and JSON output remain unchanged.
46
+ See EVAL-01 Phase 1 issue and `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §7.3`.
File without changes
@@ -0,0 +1,208 @@
1
+ from __future__ import annotations
2
+
3
+ from rich.console import Console
4
+ from rich.panel import Panel
5
+ from rich.table import Table
6
+ from rich import box
7
+ from rich.text import Text
8
+
9
+ from fred_deepeval_cli.core.models import EvaluationCaseRequest, EvaluationCaseResult
10
+
11
+ console = Console(stderr=True)
12
+
13
+
14
+ def _check_icon(value: object) -> str:
15
+ return "✅" if value is True else "❌"
16
+
17
+
18
def _outcome_text(outcome: str) -> Text:
    """Build the styled label shown in the "Outcome" panel.

    ``execution_error`` is rendered bold red; every other outcome is rendered
    bold green. The label is prefixed with a single space.
    """
    style = "bold red" if outcome == "execution_error" else "bold green"
    return Text(f" {outcome}", style=style)
22
+
23
+
24
def render_score(
    result: EvaluationCaseResult,
    request: EvaluationCaseRequest | None = None,
) -> None:
    """Render one evaluation result as a stack of Rich panels.

    Everything is printed on the module-level ``console``.

    Free-form values (agent id, session id, user input, agent output, check
    and metric names, explanations, scoring errors, and the profile name shown
    between square brackets in a panel title) are wrapped in ``Text`` so Rich
    displays them literally. Passed as plain ``str`` they would be parsed as
    console markup: a title such as ``Structural Checks [rag]`` loses its
    ``[rag]`` suffix, and text containing a stray closing tag such as ``[/x]``
    raises ``MarkupError``.

    Args:
        result: Evaluation result to display.
        request: Optional originating request. When provided, its agent id,
            session id and input are shown in the header panel.
    """
    # ── Header ──────────────────────────────────────────────────────────────
    header = Table.grid(padding=(0, 2))
    header.add_column(style="bold cyan")
    header.add_column()
    if request:
        header.add_row("Agent", Text(request.agent_id))
        header.add_row("Session", Text(request.session_id))
        header.add_row("Input", Text(request.input))
    header.add_row("Profile", Text(result.profile))

    console.print()
    console.print(Panel(header, title="[bold]fred-deepeval-cli[/bold]", border_style="cyan"))

    # ── Agent output ────────────────────────────────────────────────────────
    agent_output = result.actual_output or "—"
    console.print(Panel(Text(agent_output), title="Output", border_style="yellow"))

    # ── Outcome ─────────────────────────────────────────────────────────────
    console.print(Panel(
        _outcome_text(result.outcome),
        title="Outcome",
        border_style="green" if result.outcome != "execution_error" else "red",
    ))

    # ── Structural Checks ───────────────────────────────────────────────────
    if result.structural_checks:
        table = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
        table.add_column("Check", style="cyan")
        table.add_column("", justify="center")

        for check in result.structural_checks:
            table.add_row(Text(check.name), _check_icon(check.passed))

        # A Text title keeps the literal "[profile]" suffix visible instead of
        # letting Rich consume it as a style tag.
        checks_title = Text(f"Structural Checks [{result.profile}]")
        console.print(Panel(table, title=checks_title, border_style="magenta"))

    # ── DeepEval Metrics ────────────────────────────────────────────────────
    if result.metrics:
        table = Table(box=box.SIMPLE, show_header=True, header_style="bold blue")
        table.add_column("Metric", style="cyan")
        table.add_column("Score", justify="right")
        table.add_column("", justify="center")
        table.add_column("Reason", style="dim", no_wrap=False, max_width=60)

        for m in result.metrics:
            score_str = f"{m.score:.2f}" if isinstance(m.score, float) else "—"
            icon = "✅" if m.verdict == "passed" else ("⏭" if m.verdict == "skipped" else "❌")
            reason = m.explanation or m.error or "—"
            table.add_row(Text(m.name), score_str, icon, Text(reason))

        console.print(Panel(table, title="DeepEval Metrics", border_style="blue"))

    # ── Errors ──────────────────────────────────────────────────────────────
    if result.scoring_errors:
        console.print(Panel(
            Text("\n".join(result.scoring_errors)),
            title="Scoring Errors",
            border_style="red",
        ))

    console.print()
87
+
88
+
89
+ # ── Campaign ────────────────────────────────────────────────────────────────
90
+
91
# Metric names tracked by render_campaign: one score accumulator is created
# per name, and the per-metric averages are listed in this order.
_CAMPAIGN_METRICS = [
    "AnswerRelevancyMetric",
    "FaithfulnessMetric",
    "ContextualRelevancyMetric",
    "ContextualPrecisionMetric",
    "ContextualRecallMetric",
]
98
+
99
+
100
+ def _fmt_score(metrics_by_name: dict, name: str, totals: dict) -> str:
101
+ m = metrics_by_name.get(name)
102
+ if m is None:
103
+ return "—"
104
+ score = m.get("score")
105
+ if score is None:
106
+ return "—"
107
+ totals[name].append(score)
108
+ icon = "✅" if m.get("verdict") == "passed" else "❌"
109
+ return f"{score:.2f}{icon}"
110
+
111
+
112
def render_campaign(results: list[dict]) -> None:
    """Print the summary tables of a RAG campaign.

    Two panels are printed on the module-level ``console``: one row per
    scenario with its outcome, RAG flag and metric scores, then the average of
    each metric plus an overall average (the mean of the per-metric averages).

    Args:
        results: One dict per scenario. ``"id"`` and ``"outcome"`` are
            required. ``"rag_ok"`` is optional. ``"metrics"`` is optional and
            may be either a mapping of metric name to metric dict or an
            iterable of metric dicts that each carry a ``"name"`` key.
    """
    totals: dict[str, list[float]] = {metric: [] for metric in _CAMPAIGN_METRICS}

    table = Table(box=box.SIMPLE, show_header=True, header_style="bold cyan")
    table.add_column("ID", style="dim", width=22)
    table.add_column("Outcome", width=10)
    table.add_column("RAG", justify="center", width=5)
    score_columns = (
        ("AnswerRel", 10),
        ("Faithful", 10),
        ("CtxRel", 8),
        ("CtxPrec", 9),
        ("CtxRecall", 10),
    )
    for heading, width in score_columns:
        table.add_column(heading, justify="right", width=width)

    for row in results:
        raw_metrics = row.get("metrics", {})
        if isinstance(raw_metrics, dict):
            by_name = raw_metrics
        else:
            by_name = {entry["name"]: entry for entry in raw_metrics}

        scenario_id = row["id"]
        scenario_outcome = row["outcome"]
        rag_icon = "✅" if row.get("rag_ok") else "❌"
        # Score columns are laid out in the same order as _CAMPAIGN_METRICS.
        score_cells = [_fmt_score(by_name, metric, totals) for metric in _CAMPAIGN_METRICS]
        table.add_row(scenario_id, scenario_outcome, rag_icon, *score_cells)

    console.print()
    console.print(Panel(table, title="Résultats par scénario", border_style="cyan"))

    # ── Averages ─────────────────────────────────────────────────────────────
    avg_table = Table(box=box.SIMPLE, show_header=True, header_style="bold blue")
    avg_table.add_column("Métrique", style="cyan")
    avg_table.add_column("Moyenne", justify="right")
    avg_table.add_column("N", justify="right", style="dim")

    metric_means: list[float] = []
    for metric in _CAMPAIGN_METRICS:
        scores = totals[metric]
        if not scores:
            avg_table.add_row(metric, "—", "0")
            continue
        mean = sum(scores) / len(scores)
        metric_means.append(mean)
        avg_table.add_row(metric, f"{mean:.4f} ({mean * 100:.1f}%)", str(len(scores)))

    if metric_means:
        global_avg = sum(metric_means) / len(metric_means)
        avg_table.add_row(
            "OVERALL",
            f"{global_avg:.4f} ({global_avg * 100:.1f}%)",
            "",
            style="bold",
        )

    console.print(Panel(avg_table, title="Moyennes par métrique", border_style="blue"))
    console.print()
170
+
171
+
172
def render_sql_campaign(results: list[dict]) -> None:
    """Print the summary table of a SQL campaign.

    One row is printed per scenario on the module-level ``console``, followed
    by a panel with the passed/total count. The panel is green when every
    scenario passed, yellow when only some did, and red when none did.

    Args:
        results: One dict per scenario. ``"id"`` and ``"outcome"`` are
            required. ``"observed_checks"`` (mapping), ``"failures"`` (list of
            strings) and ``"pass"`` (bool) are optional.
    """
    table = Table(box=box.SIMPLE, show_header=True, header_style="bold cyan")
    table.add_column("ID", style="dim", width=22)
    table.add_column("Outcome", width=12)
    table.add_column("Query exec", justify="center", width=12)
    table.add_column("No error", justify="center", width=10)
    table.add_column("Pass", justify="center", width=6)
    table.add_column("Failures", style="dim", no_wrap=False, max_width=50)

    passed = 0
    for row in results:
        checks = row.get("observed_checks", {})
        failures = row.get("failures", [])
        is_pass = row.get("pass", False)
        if is_pass:
            passed += 1

        executed_icon = "✅" if checks.get("sql_query_executed") else "❌"
        no_error_icon = "✅" if checks.get("sql_no_execution_error") else "❌"
        pass_icon = "✅" if is_pass else "❌"
        failure_cell = " | ".join(failures) if failures else "—"
        table.add_row(
            row["id"],
            row["outcome"],
            executed_icon,
            no_error_icon,
            pass_icon,
            failure_cell,
        )

    console.print()
    console.print(Panel(table, title="Résultats SQL par scénario", border_style="cyan"))

    total = len(results)
    if passed == total:
        color = "green"
    elif passed > 0:
        color = "yellow"
    else:
        color = "red"
    console.print(Panel(
        f"[bold {color}]{passed}/{total} scénarios passés[/bold {color}]",
        border_style=color,
    ))
    console.print()
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+
7
+ from fred_deepeval_cli.core.models import EvaluationCaseRequest
8
+ from fred_deepeval_cli.core.evaluator import evaluate_case_sync
9
+ from fred_deepeval_cli.core.judge_factory import build_judge
10
+ from fred_deepeval_cli.cli.display import render_score
11
+ from dotenv import load_dotenv
12
+
13
+ dotenv_path = os.getenv("ENV_FILE", "./config/.env")
14
+ load_dotenv(dotenv_path)
15
+
16
+
17
def build_parser() -> argparse.ArgumentParser:
    """Create the top-level argument parser.

    A sub-command is mandatory; ``score`` is the only one registered, and it
    receives the shared evaluation arguments.
    """
    root = argparse.ArgumentParser(
        prog="fred-deepeval-cli",
        description="External CLI for evaluating Fred agent turns.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    add_shared_eval_args(
        commands.add_parser(
            "score",
            help="Evaluate one Fred agent turn and score it with DeepEval.",
        )
    )
    return root
32
+
33
+
34
def add_shared_eval_args(parser: argparse.ArgumentParser) -> None:
    """Register the evaluation arguments on *parser*.

    ``--access-token`` and ``--search-policy`` take their defaults from the
    ``FRED_ACCESS_TOKEN`` and ``FRED_SEARCH_POLICY`` environment variables,
    read when this function is called.
    """
    required_options = (
        ("--base-url", "Fred pod base URL."),
        ("--agent-id", "Agent identifier."),
        ("--input", "User input to evaluate."),
        ("--session-id", "Session identifier."),
        ("--user-id", "Runtime user identifier."),
    )
    for flag, description in required_options:
        parser.add_argument(flag, required=True, help=description)

    parser.add_argument("--team-id", help="Optional runtime team identifier.")

    env_backed_options = (
        (
            "--access-token",
            "FRED_ACCESS_TOKEN",
            "Optional bearer token for authenticated agent evaluation.",
        ),
        (
            "--search-policy",
            "FRED_SEARCH_POLICY",
            "Optional runtime search policy override (for example: semantic).",
        ),
    )
    for flag, env_name, description in env_backed_options:
        parser.add_argument(flag, default=os.environ.get(env_name), help=description)

    parser.add_argument(
        "--profile",
        default="auto",
        choices=["auto", "rag", "sql", "workflow", "default"],
        help="Evaluation profile. Defaults to auto-detection from agent_tags.",
    )
57
+
58
+
59
def run_score(args: argparse.Namespace) -> int:
    """Run the ``score`` command.

    Builds the evaluation request from the parsed arguments, evaluates it with
    the configured judge, renders the result panels and prints the result as
    JSON on standard output.

    Returns:
        ``1`` when the outcome is ``execution_error``, otherwise ``0``.
    """
    runtime_context: dict = {"user_id": args.user_id}
    # Optional context entries are forwarded only when they were supplied.
    for key in ("team_id", "search_policy"):
        value = getattr(args, key)
        if value:
            runtime_context[key] = value

    request = EvaluationCaseRequest(
        agent_id=args.agent_id,
        input=args.input,
        session_id=args.session_id,
        profile=args.profile,
        runtime_context=runtime_context,
    )

    result = evaluate_case_sync(
        base_url=args.base_url,
        request=request,
        judge=build_judge(),
        access_token=args.access_token,
    )

    render_score(result, request=request)
    payload = json.dumps(result.model_dump(), indent=2, ensure_ascii=False)
    print(payload)

    if result.outcome == "execution_error":
        return 1
    return 0
86
+
87
+
88
def main() -> int:
    """Parse the command line and dispatch to the selected command.

    Returns:
        The exit status of the executed command.
    """
    parser = build_parser()
    args = parser.parse_args()

    if args.command != "score":
        parser.error(f"Unknown command: {args.command}")
        return 2

    return run_score(args)
97
+
98
+
99
# Script entry point: exit the process with main()'s return value as status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from fred_core.common import ConfigFiles, load_configuration_with_config_files
6
+ from pydantic import BaseModel
7
+
8
+
9
class JudgeProfileSettings(BaseModel):
    """Optional connection settings attached to a judge profile."""

    # Base URL override for the judge endpoint; None leaves it unset.
    api_base: str | None = None
    # Name of the environment variable that holds the API key (not the key itself).
    api_key_env: str | None = None
    # Request timeout for judge calls — presumably seconds; TODO confirm.
    request_timeout: int = 120
13
+
14
+
15
class JudgeProfile(BaseModel):
    """One selectable judge model definition."""

    # Identifier matched against JudgeConfig.default to select the profile.
    profile_id: str
    # Backend used to reach the judge model.
    provider: str
    # Model name handed to the provider.
    model: str
    # Connection settings; every field has a default, so this may be omitted.
    settings: JudgeProfileSettings = JudgeProfileSettings()
20
+
21
+
22
class JudgeConfig(BaseModel):
    """Set of judge profiles together with the id of the one to use."""

    # profile_id of the profile returned by active().
    default: str
    profiles: list[JudgeProfile]

    def active(self) -> JudgeProfile:
        """Return the first profile whose ``profile_id`` equals ``default``.

        Raises:
            ValueError: If no profile carries that id.
        """
        selected = next(
            (candidate for candidate in self.profiles if candidate.profile_id == self.default),
            None,
        )
        if selected is None:
            raise ValueError(
                f"Judge profile '{self.default}' not found. "
                f"Available: {[p.profile_id for p in self.profiles]}"
            )
        return selected
34
+
35
+
36
class Configuration(BaseModel):
    """Root model of the configuration file."""

    # Version label of the configuration document.
    version: str = "v1"
    # Judge profiles and the default selection (required).
    judge: JudgeConfig
39
+
40
+
41
def parse_configuration(config_file: str) -> Configuration:
    """Load a YAML configuration file and validate it as a ``Configuration``.

    Args:
        config_file: Path of the YAML file, read as UTF-8.

    Raises:
        ValueError: If the file is empty or its top-level value is not a
            mapping.
    """
    import yaml

    with open(config_file, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)

    if document is None:
        raise ValueError(f"Configuration file is empty: {config_file}")
    elif not isinstance(document, dict):
        raise ValueError(f"Configuration file must be a mapping object: {config_file}")

    return Configuration.model_validate(document)
53
+
54
+
55
# Shared ConfigFiles helper handed to load_configuration_with_config_files;
# presumably it resolves which configuration file to read — TODO confirm in fred_core.
_config_files = ConfigFiles(logger=logging.getLogger(__name__))
56
+
57
+
58
def load_configuration() -> Configuration:
    """Load the configuration via ``load_configuration_with_config_files``.

    The module-level ``_config_files`` helper and ``parse_configuration`` are
    passed as its two arguments.
    """
    configuration = load_configuration_with_config_files(_config_files, parse_configuration)
    return configuration
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import httpx
4
+
5
+ from fred_deepeval_cli.core.models import (
6
+ EvaluationCaseRequest,
7
+ EvaluationCaseResult,
8
+ )
9
+ from fred_deepeval_cli.core.profiles import resolve_profile
10
+ from fred_deepeval_cli.core.structural_checks import build_structural_checks
11
+ from fred_deepeval_cli.core.scorer import score_trace
12
+
13
+
14
def classify_outcome(trace: dict) -> str:
    """Classify an evaluation trace into a single outcome label.

    Rules are checked in priority order:

    1. a truthy ``"error"`` entry        -> ``"execution_error"``
    2. any step of kind ``awaiting_human`` -> ``"hitl_blocked"``
    3. any step of kind ``node_error``     -> ``"degraded"``
    4. a truthy ``"output"`` entry       -> ``"success"``
    5. otherwise                          -> ``"unknown"``

    Args:
        trace: Decoded evaluation trace. ``"steps"`` may be missing or
            ``None``; entries that are not dicts are ignored.

    Returns:
        One of the five outcome labels above.
    """
    if trace.get("error"):
        return "execution_error"

    # A JSON payload may carry `"steps": null`, which `.get("steps", [])` would
    # return as None and then fail to iterate; normalise to an empty list.
    steps = trace.get("steps") or []
    step_kinds = [step.get("kind") for step in steps if isinstance(step, dict)]

    if "awaiting_human" in step_kinds:
        return "hitl_blocked"
    if "node_error" in step_kinds:
        return "degraded"
    if trace.get("output"):
        return "success"
    return "unknown"
24
+
25
+
26
def fetch_trace(
    base_url: str,
    request: EvaluationCaseRequest,
    access_token: str | None = None,
) -> dict:
    """POST the evaluation request to ``<base_url>/agents/evaluate``.

    Args:
        base_url: Base URL of the service; trailing slashes are stripped
            before the path is appended.
        request: Case to evaluate. Its agent id, input, session id and runtime
            context form the JSON payload.
        access_token: Optional bearer token, sent in the ``Authorization``
            header when truthy.

    Returns:
        The decoded JSON response body.

    Raises:
        httpx.HTTPStatusError: If the response status is an error status.
        RuntimeError: If the response body is not a JSON object.
    """
    endpoint = f"{base_url.rstrip('/')}/agents/evaluate"

    headers = {"Content-Type": "application/json"}
    if access_token:
        headers["Authorization"] = f"Bearer {access_token}"

    body = {
        "agent_id": request.agent_id,
        "input": request.input,
        "session_id": request.session_id,
        "runtime_context": request.runtime_context,
    }

    # read=None disables the read timeout; connect is capped at 5 seconds.
    timeout = httpx.Timeout(30.0, connect=5.0, read=None)
    with httpx.Client(timeout=timeout) as client:
        response = client.post(endpoint, json=body, headers=headers)
        response.raise_for_status()
        decoded = response.json()

    if isinstance(decoded, dict):
        return decoded
    raise RuntimeError("Evaluate response must be a JSON object.")
53
+
54
+
55
def evaluate_case_sync(
    base_url: str,
    request: EvaluationCaseRequest,
    judge=None,
    access_token: str | None = None,
) -> EvaluationCaseResult:
    """Evaluate one case synchronously and return the assembled result.

    The trace is fetched, classified, checked structurally and, when a judge
    is supplied, scored.

    Args:
        base_url: Base URL passed to ``fetch_trace``.
        request: Case to evaluate.
        judge: Judge model handed to ``score_trace``; scoring is skipped when
            ``None``.
        access_token: Optional bearer token passed to ``fetch_trace``.

    Returns:
        The evaluation result. If fetching the trace raises, an
        ``execution_error`` result is returned with the exception text, the
        requested profile and no checks or metrics.
    """
    try:
        trace = fetch_trace(base_url, request, access_token=access_token)
    except Exception as exc:
        # Any fetch failure is reported as a result instead of propagating.
        return EvaluationCaseResult(
            outcome="execution_error",
            profile=request.profile,
            structural_checks=[],
            metrics=[],
            execution_error=str(exc),
        )

    outcome = classify_outcome(trace)
    profile = resolve_profile(trace, explicit_profile=request.profile)
    structural_checks = build_structural_checks(trace, profile=profile)

    if judge is None:
        metrics, scoring_errors = [], []
    else:
        metrics, scoring_errors = score_trace(
            trace,
            profile=profile,
            expected_output=request.expected_output,
            judge=judge,
        )

    return EvaluationCaseResult(
        outcome=outcome,
        profile=profile,
        structural_checks=structural_checks,
        metrics=metrics,
        actual_output=trace.get("output"),
        latency_ms=trace.get("latency_ms"),
        execution_error=trace.get("error"),
        scoring_errors=scoring_errors,
    )
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from fred_deepeval_cli.core.config_loader import load_configuration
6
+
7
+
8
def _require_api_key(env_name: str, provider: str) -> str:
    """Return the API key stored in *env_name*, or raise if it is unset or empty."""
    api_key = os.environ.get(env_name)
    if not api_key:
        raise RuntimeError(
            f"Missing {env_name} in environment/.env for the {provider} judge."
        )
    return api_key


def build_judge(config=None):
    """Instantiate the DeepEval judge model described by the active profile.

    Args:
        config: Loaded configuration; when ``None`` it is obtained from
            ``load_configuration()``.

    Returns:
        A ``LiteLLMModel`` for the ``litellm`` and ``ollama`` providers, or a
        ``GPTModel`` for the ``openai`` provider.

    Raises:
        RuntimeError: If the API key environment variable required by the
            ``litellm`` or ``openai`` provider is unset or empty.
        ValueError: If the provider is not supported (or, from
            ``config.judge.active()``, if the default profile is unknown).
    """
    # Imported lazily: deepeval is only needed once a judge is actually built.
    from deepeval.models.llms import GPTModel, LiteLLMModel

    if config is None:
        config = load_configuration()

    profile = config.judge.active()
    provider = profile.provider
    model_name = profile.model
    settings = profile.settings

    if provider == "litellm":
        api_key = _require_api_key(settings.api_key_env or "LITELLM_API_KEY", "litellm")
        return LiteLLMModel(
            model=model_name,
            api_key=api_key,
            base_url=settings.api_base,
            request_timeout=settings.request_timeout,
            num_retries=0,
        )

    if provider == "ollama":
        return LiteLLMModel(
            model=f"ollama/{model_name}",
            api_key="ollama",
            base_url=settings.api_base or "http://localhost:11434",
            request_timeout=settings.request_timeout,
            num_retries=0,
        )

    if provider == "openai":
        api_key_env = settings.api_key_env or "OPENAI_API_KEY"
        api_key = _require_api_key(api_key_env, "openai")
        # GPTModel is created without an explicit key, so it presumably resolves
        # credentials from OPENAI_API_KEY (TODO confirm against the installed
        # deepeval version). Mirror a key configured under a custom variable
        # there; otherwise it would be validated above and then never used.
        if api_key_env != "OPENAI_API_KEY":
            os.environ["OPENAI_API_KEY"] = api_key
        return GPTModel(model=model_name)

    raise ValueError(f"Unsupported judge provider: {provider}")
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal
4
+ from pydantic import BaseModel
5
+
6
+
7
class EvaluationMetricResult(BaseModel):
    """Result of one scored metric."""

    # Metric name.
    name: str
    # Origin of the metric — presumably the scoring backend; TODO confirm.
    provider: str
    # Numeric score; required field, None when no score is available.
    score: float | None
    # Threshold associated with the metric, when one applies.
    threshold: float | None = None
    # Final status of the metric.
    verdict: Literal["passed", "failed", "skipped", "error"]
    # Explanation text for the score, when available.
    explanation: str | None = None
    # Error text for the metric, when available.
    error: str | None = None
15
+
16
+
17
class StructuralCheckResult(BaseModel):
    """Result of one structural check."""

    # Check name.
    name: str
    # Whether the check passed.
    passed: bool
    # Optional additional detail about the check.
    detail: str | None = None
21
+
22
+
23
class EvaluationCaseRequest(BaseModel):
    """Input describing one case to evaluate."""

    # Identifier of the agent to evaluate.
    agent_id: str
    # User input for the evaluated turn.
    input: str
    # Session identifier.
    session_id: str
    # Optional reference answer.
    expected_output: str | None = None
    # Evaluation profile name; defaults to "auto".
    profile: str = "auto"
    # Free-form runtime context. A mutable default is acceptable here because
    # pydantic copies field defaults for each instance.
    runtime_context: dict = {}
30
+
31
+
32
class EvaluationCaseResult(BaseModel):
    """Outcome of evaluating one case."""

    # Version tag of this result schema.
    schema_version: Literal["1"] = "1"
    # Outcome label of the evaluated turn.
    outcome: Literal["success", "execution_error", "degraded", "hitl_blocked", "unknown"]
    # Evaluation profile name recorded with the result.
    profile: str
    # Structural check results (may be empty).
    structural_checks: list[StructuralCheckResult]
    # Scored metric results (may be empty).
    metrics: list[EvaluationMetricResult]
    # Latency in milliseconds, when known.
    latency_ms: int | None = None
    # Agent output text, when any.
    actual_output: str | None = None
    # Execution error text, when any.
    execution_error: str | None = None
    # Scoring error messages. A mutable default is acceptable here because
    # pydantic copies field defaults for each instance.
    scoring_errors: list[str] = []