fred-deepeval-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
File without changes
@@ -0,0 +1,208 @@
1
+ from __future__ import annotations
2
+
3
+ from rich.console import Console
4
+ from rich.panel import Panel
5
+ from rich.table import Table
6
+ from rich import box
7
+ from rich.text import Text
8
+
9
+ from fred_deepeval_cli.core.models import EvaluationCaseRequest, EvaluationCaseResult
10
+
11
# Module-wide Rich console. Every renderer in this module prints through it,
# and it is bound to stderr rather than stdout.
console = Console(stderr=True)
12
+
13
+
14
+ def _check_icon(value: object) -> str:
15
+ return "✅" if value is True else "❌"
16
+
17
+
18
def _outcome_text(outcome: str) -> Text:
    """Build the styled outcome label shown in the "Outcome" panel.

    ``"execution_error"`` is rendered in bold red; every other outcome is
    rendered in bold green. The label is prefixed with a single space.
    """
    colour = "red" if outcome == "execution_error" else "green"
    return Text(f" {outcome}", style=f"bold {colour}")
22
+
23
+
24
def render_score(
    result: EvaluationCaseResult,
    request: EvaluationCaseRequest | None = None,
) -> None:
    """Print one evaluation result as a stack of Rich panels.

    Panels are emitted in this order: header (request fields when given, plus
    the profile), agent output, outcome, structural checks (only when there
    are any), DeepEval metrics (only when there are any) and scoring errors
    (only when there are any).

    All dynamic strings (agent output, user input, metric explanations, error
    messages, profile name) are wrapped in ``Text`` so Rich prints them
    verbatim. Passed as plain ``str`` they would be parsed as console markup:
    a fragment such as ``[rag]`` would be swallowed as a style tag and an
    unbalanced ``[/x]`` would raise ``MarkupError``.

    Args:
        result: The evaluation result to display.
        request: Optional originating request; when provided, its agent id,
            session id and input are shown in the header.
    """
    # ── Header ──────────────────────────────────────────────────────────────
    header = Table.grid(padding=(0, 2))
    header.add_column(style="bold cyan")
    header.add_column()
    if request:
        header.add_row("Agent", Text(request.agent_id))
        header.add_row("Session", Text(request.session_id))
        header.add_row("Input", Text(request.input))
    header.add_row("Profile", Text(result.profile))

    console.print()
    console.print(Panel(header, title="[bold]fred-deepeval-cli[/bold]", border_style="cyan"))

    # ── Agent output ────────────────────────────────────────────────────────
    agent_output = result.actual_output or "—"
    console.print(Panel(Text(agent_output), title="Output", border_style="yellow"))

    # ── Outcome ─────────────────────────────────────────────────────────────
    console.print(Panel(
        _outcome_text(result.outcome),
        title="Outcome",
        border_style="green" if result.outcome != "execution_error" else "red",
    ))

    # ── Structural checks ───────────────────────────────────────────────────
    if result.structural_checks:
        table = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
        table.add_column("Check", style="cyan")
        table.add_column("", justify="center")

        for check in result.structural_checks:
            table.add_row(Text(check.name), _check_icon(check.passed))

        # A Text title keeps the literal "[profile]" suffix; as a plain str the
        # bracketed profile name would be consumed as a markup tag.
        console.print(Panel(
            table,
            title=Text(f"Structural Checks [{result.profile}]"),
            border_style="magenta",
        ))

    # ── DeepEval metrics ────────────────────────────────────────────────────
    if result.metrics:
        table = Table(box=box.SIMPLE, show_header=True, header_style="bold blue")
        table.add_column("Metric", style="cyan")
        table.add_column("Score", justify="right")
        table.add_column("", justify="center")
        table.add_column("Reason", style="dim", no_wrap=False, max_width=60)

        for m in result.metrics:
            score_str = f"{m.score:.2f}" if isinstance(m.score, float) else "—"
            icon = "✅" if m.verdict == "passed" else ("⏭" if m.verdict == "skipped" else "❌")
            reason = m.explanation or m.error or "—"
            table.add_row(Text(m.name), score_str, icon, Text(reason))

        console.print(Panel(table, title="DeepEval Metrics", border_style="blue"))

    # ── Scoring errors ──────────────────────────────────────────────────────
    if result.scoring_errors:
        console.print(Panel(
            Text("\n".join(result.scoring_errors)),
            title="Scoring Errors",
            border_style="red",
        ))

    console.print()
87
+
88
+
89
+ # ── Campagne ────────────────────────────────────────────────────────────────
90
+
91
+ _CAMPAIGN_METRICS = [
92
+ "AnswerRelevancyMetric",
93
+ "FaithfulnessMetric",
94
+ "ContextualRelevancyMetric",
95
+ "ContextualPrecisionMetric",
96
+ "ContextualRecallMetric",
97
+ ]
98
+
99
+
100
+ def _fmt_score(metrics_by_name: dict, name: str, totals: dict) -> str:
101
+ m = metrics_by_name.get(name)
102
+ if m is None:
103
+ return "—"
104
+ score = m.get("score")
105
+ if score is None:
106
+ return "—"
107
+ totals[name].append(score)
108
+ icon = "✅" if m.get("verdict") == "passed" else "❌"
109
+ return f"{score:.2f}{icon}"
110
+
111
+
112
def render_campaign(results: list[dict]) -> None:
    """Print the summary tables of a RAG campaign.

    Two panels are printed: a per-scenario table (id, outcome, RAG check flag
    and one score cell per campaign metric) and a per-metric averages table
    ending with an ``OVERALL`` row, which is the mean of the per-metric means.

    Args:
        results: One dict per scenario. ``"id"`` and ``"outcome"`` are
            required; ``"rag_ok"`` and ``"metrics"`` are optional.
            ``"metrics"`` may be a name-keyed dict or a list of dicts that
            each carry a ``"name"`` key.
    """
    score_lists: dict[str, list[float]] = {metric: [] for metric in _CAMPAIGN_METRICS}

    scenario_table = Table(box=box.SIMPLE, show_header=True, header_style="bold cyan")
    column_specs = (
        ("ID", {"style": "dim", "width": 22}),
        ("Outcome", {"width": 10}),
        ("RAG", {"justify": "center", "width": 5}),
        ("AnswerRel", {"justify": "right", "width": 10}),
        ("Faithful", {"justify": "right", "width": 10}),
        ("CtxRel", {"justify": "right", "width": 8}),
        ("CtxPrec", {"justify": "right", "width": 9}),
        ("CtxRecall", {"justify": "right", "width": 10}),
    )
    for heading, options in column_specs:
        scenario_table.add_column(heading, **options)

    for row in results:
        raw = row.get("metrics", {})
        if isinstance(raw, dict):
            by_name = raw
        else:
            by_name = {item["name"]: item for item in raw}

        scenario_id = row["id"]
        outcome = row["outcome"]
        rag_cell = "✅" if row.get("rag_ok") else "❌"
        # One cell per campaign metric, in column order; also feeds score_lists.
        score_cells = [_fmt_score(by_name, metric, score_lists) for metric in _CAMPAIGN_METRICS]
        scenario_table.add_row(scenario_id, outcome, rag_cell, *score_cells)

    console.print()
    console.print(Panel(scenario_table, title="Résultats par scénario", border_style="cyan"))

    # ── Averages ─────────────────────────────────────────────────────────────
    averages = Table(box=box.SIMPLE, show_header=True, header_style="bold blue")
    averages.add_column("Métrique", style="cyan")
    averages.add_column("Moyenne", justify="right")
    averages.add_column("N", justify="right", style="dim")

    per_metric_means: list[float] = []
    for metric in _CAMPAIGN_METRICS:
        collected = score_lists[metric]
        if not collected:
            averages.add_row(metric, "—", "0")
            continue
        mean = sum(collected) / len(collected)
        per_metric_means.append(mean)
        averages.add_row(metric, f"{mean:.4f} ({mean * 100:.1f}%)", str(len(collected)))

    if per_metric_means:
        grand_mean = sum(per_metric_means) / len(per_metric_means)
        averages.add_row(
            "OVERALL",
            f"{grand_mean:.4f} ({grand_mean * 100:.1f}%)",
            "",
            style="bold",
        )

    console.print(Panel(averages, title="Moyennes par métrique", border_style="blue"))
    console.print()
170
+
171
+
172
def render_sql_campaign(results: list[dict]) -> None:
    """Print the summary of a SQL campaign.

    A per-scenario table is printed first, followed by a one-line panel with
    the number of passed scenarios. The panel is green when every scenario
    passed, yellow when at least one passed, and red otherwise.

    Args:
        results: One dict per scenario. ``"id"`` and ``"outcome"`` are
            required; ``"observed_checks"`` (dict), ``"failures"`` (list of
            str) and ``"pass"`` (bool) are optional.
    """
    summary = Table(box=box.SIMPLE, show_header=True, header_style="bold cyan")
    summary.add_column("ID", style="dim", width=22)
    summary.add_column("Outcome", width=12)
    summary.add_column("Query exec", justify="center", width=12)
    summary.add_column("No error", justify="center", width=10)
    summary.add_column("Pass", justify="center", width=6)
    summary.add_column("Failures", style="dim", no_wrap=False, max_width=50)

    def _mark(flag) -> str:
        # Truthiness-based tick/cross used for every boolean column.
        return "✅" if flag else "❌"

    passed = 0
    for row in results:
        observed = row.get("observed_checks", {})
        failure_list = row.get("failures", [])
        scenario_passed = row.get("pass", False)
        if scenario_passed:
            passed += 1

        summary.add_row(
            row["id"],
            row["outcome"],
            _mark(observed.get("sql_query_executed")),
            _mark(observed.get("sql_no_execution_error")),
            _mark(scenario_passed),
            " | ".join(failure_list) if failure_list else "—",
        )

    console.print()
    console.print(Panel(summary, title="Résultats SQL par scénario", border_style="cyan"))

    total = len(results)
    if passed == total:
        color = "green"
    elif passed > 0:
        color = "yellow"
    else:
        color = "red"

    console.print(Panel(
        f"[bold {color}]{passed}/{total} scénarios passés[/bold {color}]",
        border_style=color,
    ))
    console.print()
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+
7
+ from fred_deepeval_cli.core.models import EvaluationCaseRequest
8
+ from fred_deepeval_cli.core.evaluator import evaluate_case_sync
9
+ from fred_deepeval_cli.core.judge_factory import build_judge
10
+ from fred_deepeval_cli.cli.display import render_score
11
+ from dotenv import load_dotenv
12
+
13
# Load environment variables at import time from the file named by ENV_FILE,
# falling back to ./config/.env when ENV_FILE is not set.
dotenv_path = os.getenv("ENV_FILE", "./config/.env")
load_dotenv(dotenv_path)
15
+
16
+
17
def build_parser() -> argparse.ArgumentParser:
    """Create the top-level argument parser.

    The parser requires a sub-command (stored in ``args.command``). The only
    sub-command registered here is ``score``, which receives the shared
    evaluation arguments.
    """
    root = argparse.ArgumentParser(
        prog="fred-deepeval-cli",
        description="External CLI for evaluating Fred agent turns.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    score_cmd = commands.add_parser(
        "score",
        help="Evaluate one Fred agent turn and score it with DeepEval.",
    )
    add_shared_eval_args(score_cmd)

    return root
32
+
33
+
34
def add_shared_eval_args(parser: argparse.ArgumentParser) -> None:
    """Register the evaluation arguments on *parser*.

    Required: ``--base-url``, ``--agent-id``, ``--input``, ``--session-id``
    and ``--user-id``. Optional: ``--team-id``, ``--access-token`` (defaults
    to ``$FRED_ACCESS_TOKEN``), ``--search-policy`` (defaults to
    ``$FRED_SEARCH_POLICY``) and ``--profile`` (defaults to ``auto``).
    """
    mandatory = (
        ("--base-url", "Fred pod base URL."),
        ("--agent-id", "Agent identifier."),
        ("--input", "User input to evaluate."),
        ("--session-id", "Session identifier."),
        ("--user-id", "Runtime user identifier."),
    )
    for flag, description in mandatory:
        parser.add_argument(flag, required=True, help=description)

    parser.add_argument("--team-id", help="Optional runtime team identifier.")

    # Environment-backed options: the environment is read once, when the
    # parser is built.
    env_backed = (
        (
            "--access-token",
            "FRED_ACCESS_TOKEN",
            "Optional bearer token for authenticated agent evaluation.",
        ),
        (
            "--search-policy",
            "FRED_SEARCH_POLICY",
            "Optional runtime search policy override (for example: semantic).",
        ),
    )
    for flag, env_name, description in env_backed:
        parser.add_argument(flag, default=os.environ.get(env_name), help=description)

    parser.add_argument(
        "--profile",
        default="auto",
        choices=["auto", "rag", "sql", "workflow", "default"],
        help="Evaluation profile. Defaults to auto-detection from agent_tags.",
    )
57
+
58
+
59
def run_score(args: argparse.Namespace) -> int:
    """Run the ``score`` sub-command.

    Builds the request from the parsed arguments, evaluates it with the
    configured judge, renders the panels through ``render_score`` and prints
    the result as indented JSON with ``print``.

    Returns:
        ``1`` when the outcome is ``"execution_error"``, ``0`` otherwise.
    """
    # user_id is always sent; team_id / search_policy only when provided.
    context: dict = {"user_id": args.user_id}
    for key in ("team_id", "search_policy"):
        value = getattr(args, key)
        if value:
            context[key] = value

    request = EvaluationCaseRequest(
        agent_id=args.agent_id,
        input=args.input,
        session_id=args.session_id,
        profile=args.profile,
        runtime_context=context,
    )

    result = evaluate_case_sync(
        base_url=args.base_url,
        request=request,
        judge=build_judge(),
        access_token=args.access_token,
    )

    render_score(result, request=request)
    print(json.dumps(result.model_dump(), indent=2, ensure_ascii=False))

    if result.outcome == "execution_error":
        return 1
    return 0
86
+
87
+
88
def main() -> int:
    """CLI entry point: parse ``sys.argv`` and dispatch to the sub-command.

    Returns:
        The process exit code produced by the selected sub-command.
    """
    parser = build_parser()
    args = parser.parse_args()

    if args.command != "score":
        # parser.error() exits the process itself; the return is a fallback.
        parser.error(f"Unknown command: {args.command}")
        return 2

    return run_score(args)


if __name__ == "__main__":
    raise SystemExit(main())
File without changes
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from fred_core.common import ConfigFiles, load_configuration_with_config_files
6
+ from pydantic import BaseModel
7
+
8
+
9
class JudgeProfileSettings(BaseModel):
    """Optional connection settings attached to a judge profile."""

    # Base URL of the model endpoint, if any.
    api_base: str | None = None
    # Name of the environment variable that holds the API key (not the key itself).
    api_key_env: str | None = None
    # Request timeout handed to the judge model; presumably seconds — TODO confirm.
    request_timeout: int = 120
13
+
14
+
15
class JudgeProfile(BaseModel):
    """One named judge configuration: provider, model and connection settings."""

    # Identifier matched against JudgeConfig.default.
    profile_id: str
    # Judge backend name.
    provider: str
    # Model name passed to the provider.
    model: str
    # Connection settings; defaults to an all-default JudgeProfileSettings.
    settings: JudgeProfileSettings = JudgeProfileSettings()
20
+
21
+
22
class JudgeConfig(BaseModel):
    """Set of judge profiles plus the id of the one to use."""

    # profile_id of the profile returned by active().
    default: str
    profiles: list[JudgeProfile]

    def active(self) -> JudgeProfile:
        """Return the first profile whose ``profile_id`` equals ``default``.

        Raises:
            ValueError: If no profile matches; the message lists the
                available profile ids.
        """
        selected = next(
            (candidate for candidate in self.profiles if candidate.profile_id == self.default),
            None,
        )
        if selected is None:
            raise ValueError(
                f"Judge profile '{self.default}' not found. "
                f"Available: {[p.profile_id for p in self.profiles]}"
            )
        return selected
34
+
35
+
36
class Configuration(BaseModel):
    """Root of the configuration file."""

    # Schema version of the configuration document.
    version: str = "v1"
    # Judge section (required).
    judge: JudgeConfig
39
+
40
+
41
def parse_configuration(config_file: str) -> Configuration:
    """Load and validate a YAML configuration file.

    Args:
        config_file: Path of the YAML file, read as UTF-8.

    Returns:
        The validated ``Configuration``.

    Raises:
        ValueError: If the file is empty or its top-level value is not a mapping.
    """
    import yaml

    with open(config_file, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)

    # An empty YAML document loads as None; report it separately from a
    # document with the wrong top-level type.
    if document is None:
        raise ValueError(f"Configuration file is empty: {config_file}")
    if isinstance(document, dict):
        return Configuration.model_validate(document)
    raise ValueError(f"Configuration file must be a mapping object: {config_file}")
53
+
54
+
55
# Shared fred_core ConfigFiles helper, created once at import time with this
# module's logger.
_config_files = ConfigFiles(logger=logging.getLogger(__name__))


def load_configuration() -> Configuration:
    """Load the configuration through fred_core's config-file helper.

    File discovery is delegated to ``load_configuration_with_config_files``;
    parsing is done by ``parse_configuration``.
    """
    return load_configuration_with_config_files(_config_files, parse_configuration)
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import httpx
4
+
5
+ from fred_deepeval_cli.core.models import (
6
+ EvaluationCaseRequest,
7
+ EvaluationCaseResult,
8
+ )
9
+ from fred_deepeval_cli.core.profiles import resolve_profile
10
+ from fred_deepeval_cli.core.structural_checks import build_structural_checks
11
+ from fred_deepeval_cli.core.scorer import score_trace
12
+
13
+
14
def classify_outcome(trace: dict) -> str:
    """Classify an evaluation trace into an outcome label.

    Rules are applied in priority order:

    1. a truthy ``"error"`` -> ``"execution_error"``
    2. any step of kind ``"awaiting_human"`` -> ``"hitl_blocked"``
    3. any step of kind ``"node_error"`` -> ``"degraded"``
    4. a truthy ``"output"`` -> ``"success"``
    5. otherwise -> ``"unknown"``

    Args:
        trace: Decoded trace. A missing or ``None`` ``"steps"`` value is
            treated as an empty list.

    Returns:
        One of the five outcome labels above.
    """
    if trace.get("error"):
        return "execution_error"

    # `or []` also covers an explicit JSON null, which `.get("steps", [])`
    # would pass through as None and then fail to iterate.
    steps = trace.get("steps") or []

    if any(step.get("kind") == "awaiting_human" for step in steps):
        return "hitl_blocked"
    if any(step.get("kind") == "node_error" for step in steps):
        return "degraded"
    if trace.get("output"):
        return "success"
    return "unknown"
24
+
25
+
26
def fetch_trace(
    base_url: str,
    request: EvaluationCaseRequest,
    access_token: str | None = None,
) -> dict:
    """POST the evaluation request to ``<base_url>/agents/evaluate``.

    Args:
        base_url: Base URL of the pod; trailing slashes are stripped.
        request: Request whose agent id, input, session id and runtime
            context form the JSON body.
        access_token: Optional bearer token sent in ``Authorization``.

    Returns:
        The decoded JSON object returned by the endpoint.

    Raises:
        httpx.HTTPStatusError: On a 4xx/5xx response (``raise_for_status``).
        RuntimeError: If the decoded JSON is not an object.
    """
    endpoint = f"{base_url.rstrip('/')}/agents/evaluate"

    headers = {"Content-Type": "application/json"}
    if access_token:
        headers["Authorization"] = f"Bearer {access_token}"

    body = {
        "agent_id": request.agent_id,
        "input": request.input,
        "session_id": request.session_id,
        "runtime_context": request.runtime_context,
    }

    # read=None disables the read timeout; connect is capped at 5 s and the
    # remaining phases at 30 s.
    timeout = httpx.Timeout(30.0, connect=5.0, read=None)
    with httpx.Client(timeout=timeout) as client:
        response = client.post(endpoint, json=body, headers=headers)
        response.raise_for_status()
        decoded = response.json()

    if isinstance(decoded, dict):
        return decoded
    raise RuntimeError("Evaluate response must be a JSON object.")
53
+
54
+
55
def evaluate_case_sync(
    base_url: str,
    request: EvaluationCaseRequest,
    judge=None,
    access_token: str | None = None,
) -> EvaluationCaseResult:
    """Evaluate one agent turn synchronously.

    The trace is fetched from the pod, classified, matched to a profile and
    checked structurally. Metrics are computed only when a judge is supplied.

    Args:
        base_url: Base URL of the pod.
        request: The evaluation case to run.
        judge: Judge model handed to ``score_trace``; ``None`` skips scoring.
        access_token: Optional bearer token for the pod.

    Returns:
        The evaluation result. Any exception raised while fetching the trace
        is converted into an ``"execution_error"`` result carrying the
        exception text, with the requested profile left unresolved.
    """
    try:
        trace = fetch_trace(base_url, request, access_token=access_token)
    except Exception as exc:
        return EvaluationCaseResult(
            outcome="execution_error",
            profile=request.profile,
            structural_checks=[],
            metrics=[],
            execution_error=str(exc),
        )

    outcome = classify_outcome(trace)
    resolved_profile = resolve_profile(trace, explicit_profile=request.profile)
    checks = build_structural_checks(trace, profile=resolved_profile)

    if judge is None:
        metric_results, errors = [], []
    else:
        metric_results, errors = score_trace(
            trace,
            profile=resolved_profile,
            expected_output=request.expected_output,
            judge=judge,
        )

    return EvaluationCaseResult(
        outcome=outcome,
        profile=resolved_profile,
        structural_checks=checks,
        metrics=metric_results,
        actual_output=trace.get("output"),
        latency_ms=trace.get("latency_ms"),
        execution_error=trace.get("error"),
        scoring_errors=errors,
    )
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from fred_deepeval_cli.core.config_loader import load_configuration
6
+
7
+
8
def _require_api_key(env_name: str, provider: str) -> str:
    """Return the API key stored in *env_name* or raise if it is missing/empty."""
    api_key = os.environ.get(env_name)
    if not api_key:
        raise RuntimeError(
            f"Missing {env_name} in environment/.env for the {provider} judge."
        )
    return api_key


def build_judge(config=None):
    """Build the DeepEval judge model described by the active judge profile.

    Args:
        config: Loaded configuration; when ``None`` it is loaded with
            ``load_configuration()``.

    Returns:
        A ``LiteLLMModel`` for the ``litellm`` and ``ollama`` providers, or a
        ``GPTModel`` for the ``openai`` provider.

    Raises:
        RuntimeError: If the API key variable required by the provider is
            unset or empty.
        ValueError: If the provider is not supported (or the active profile
            cannot be found).
    """
    from deepeval.models.llms import GPTModel, LiteLLMModel

    if config is None:
        config = load_configuration()

    profile = config.judge.active()
    provider = profile.provider
    model_name = profile.model
    settings = profile.settings

    if provider == "litellm":
        api_key = _require_api_key(settings.api_key_env or "LITELLM_API_KEY", provider)
        return LiteLLMModel(
            model=model_name,
            api_key=api_key,
            base_url=settings.api_base,
            request_timeout=settings.request_timeout,
            num_retries=0,
        )

    if provider == "ollama":
        # The api_key is a fixed placeholder string; no environment variable
        # is read for this provider.
        return LiteLLMModel(
            model=f"ollama/{model_name}",
            api_key="ollama",
            base_url=settings.api_base or "http://localhost:11434",
            request_timeout=settings.request_timeout,
            num_retries=0,
        )

    if provider == "openai":
        api_key_env = settings.api_key_env or "OPENAI_API_KEY"
        api_key = _require_api_key(api_key_env, provider)
        # GPTModel is built from the model name only, so a key validated from
        # a custom variable would otherwise never reach it. Mirror it into
        # OPENAI_API_KEY, which is where the OpenAI client is documented to
        # look. NOTE(review): switch to an explicit constructor argument once
        # the supported deepeval version range is pinned.
        if api_key_env != "OPENAI_API_KEY":
            os.environ["OPENAI_API_KEY"] = api_key
        return GPTModel(model=model_name)

    raise ValueError(f"Unsupported judge provider: {provider}")
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal
4
+ from pydantic import BaseModel
5
+
6
+
7
class EvaluationMetricResult(BaseModel):
    """Outcome of a single metric computed on one evaluation case."""

    # Metric name.
    name: str
    # Library that produced the metric.
    provider: str
    # Numeric score; None when no score is available.
    score: float | None
    # Pass threshold, when known.
    threshold: float | None = None
    verdict: Literal["passed", "failed", "skipped", "error"]
    # Free-text justification for the score, if any.
    explanation: str | None = None
    # Error text, if any.
    error: str | None = None
15
+
16
+
17
class StructuralCheckResult(BaseModel):
    """Result of one structural (non-LLM) check on a trace."""

    # Check identifier.
    name: str
    # Whether the check passed.
    passed: bool
    # Optional human-readable detail.
    detail: str | None = None
21
+
22
+
23
class EvaluationCaseRequest(BaseModel):
    """Input of one evaluation case: which agent to call and with what."""

    agent_id: str
    # User input to evaluate.
    input: str
    session_id: str
    # Reference answer, when one is available.
    expected_output: str | None = None
    # Evaluation profile name; "auto" is the default.
    profile: str = "auto"
    # Free-form runtime context dict.
    runtime_context: dict = {}
30
+
31
+
32
class EvaluationCaseResult(BaseModel):
    """Complete result of one evaluation case."""

    # Version tag of this result schema.
    schema_version: Literal["1"] = "1"
    outcome: Literal["success", "execution_error", "degraded", "hitl_blocked", "unknown"]
    # Profile name recorded for this case.
    profile: str
    structural_checks: list[StructuralCheckResult]
    metrics: list[EvaluationMetricResult]
    # Latency of the turn; presumably milliseconds, per the field name.
    latency_ms: int | None = None
    # Agent output, if any.
    actual_output: str | None = None
    # Execution error text, if any.
    execution_error: str | None = None
    # Scoring error messages; empty by default.
    scoring_errors: list[str] = []
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
# Profiles that may be selected explicitly; "auto" is deliberately not a member.
SUPPORTED_PROFILES = {"rag", "sql", "workflow", "default"}


def resolve_profile(trace: dict, explicit_profile: str = "auto") -> str:
    """Pick the evaluation profile for a trace.

    An explicit profile wins when it is one of ``SUPPORTED_PROFILES``.
    Otherwise (``"auto"`` or an unsupported name) the profile is derived from
    the trace's ``agent_tags``, checking ``rag``, then ``sql``, then
    ``workflow``, and falling back to ``"default"``.

    Args:
        trace: Decoded trace. A missing or ``None`` ``"agent_tags"`` value is
            treated as no tags.
        explicit_profile: Profile requested by the caller.

    Returns:
        One of ``"rag"``, ``"sql"``, ``"workflow"`` or ``"default"``.
    """
    if explicit_profile != "auto" and explicit_profile in SUPPORTED_PROFILES:
        return explicit_profile

    # `or []` also covers an explicit JSON null, for which set(None) would
    # raise TypeError.
    agent_tags = set(trace.get("agent_tags") or [])

    # Priority order matters when an agent carries several tags.
    for candidate in ("rag", "sql", "workflow"):
        if candidate in agent_tags:
            return candidate

    return "default"
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from deepeval.test_case import LLMTestCase
6
+
7
+ from fred_deepeval_cli.core.models import EvaluationMetricResult
8
+
9
# Raise the "LiteLLM" and "root" loggers to CRITICAL at import time.
# NOTE(review): on recent Python versions getLogger("root") returns the root
# logger, so this may mute logging for the whole host process — confirm this
# is intended for library use.
logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
logging.getLogger("root").setLevel(logging.CRITICAL)
11
+
12
+
13
def _trace_to_test_case(trace: dict, expected_output: str | None = None) -> LLMTestCase:
    """Convert a trace dict into a DeepEval ``LLMTestCase``.

    A missing ``"input"`` becomes ``""``; a missing or falsy ``"output"``
    becomes ``""``; a missing or falsy ``"retrieval_context"`` becomes ``[]``.
    """
    question = trace.get("input", "")
    answer = trace.get("output") or ""
    context = trace.get("retrieval_context", []) or []
    return LLMTestCase(
        input=question,
        actual_output=answer,
        expected_output=expected_output,
        retrieval_context=context,
    )
20
+
21
+
22
def score_trace(
    trace: dict,
    profile: str = "default",
    expected_output: str | None = None,
    judge=None,
) -> tuple[list[EvaluationMetricResult], list[str]]:
    """Score a trace with DeepEval metrics.

    ``AnswerRelevancyMetric`` always runs. For the ``"rag"`` profile with a
    non-empty retrieval context, ``FaithfulnessMetric`` and
    ``ContextualRelevancyMetric`` are added, plus
    ``ContextualPrecisionMetric`` and ``ContextualRecallMetric`` when an
    expected output is available.

    Args:
        trace: Decoded trace.
        profile: Resolved evaluation profile.
        expected_output: Reference answer, if any.
        judge: Model passed to every metric as ``model``.

    Returns:
        ``(results, scoring_errors)``. A metric that raises yields a result
        with ``verdict="error"`` and an entry in ``scoring_errors`` of the
        form ``"<MetricClass>: <message>"``; the remaining metrics still run.
    """
    from deepeval.metrics import (
        AnswerRelevancyMetric,
        ContextualPrecisionMetric,
        ContextualRecallMetric,
        ContextualRelevancyMetric,
        FaithfulnessMetric,
    )

    test_case = _trace_to_test_case(trace, expected_output=expected_output)
    has_context = bool(trace.get("retrieval_context") or [])

    selected = [AnswerRelevancyMetric]
    if profile == "rag" and has_context:
        selected += [FaithfulnessMetric, ContextualRelevancyMetric]
        if expected_output:
            selected += [ContextualPrecisionMetric, ContextualRecallMetric]

    # All metrics are instantiated up front and run synchronously.
    metrics = [metric_cls(model=judge, async_mode=False) for metric_cls in selected]

    results: list[EvaluationMetricResult] = []
    scoring_errors: list[str] = []

    for metric in metrics:
        metric_name = metric.__class__.__name__
        try:
            metric.measure(test_case)
            outcome = EvaluationMetricResult(
                name=metric_name,
                provider="deepeval",
                score=metric.score,
                verdict="passed" if metric.success else "failed",
                explanation=getattr(metric, "reason", None),
            )
        except Exception as exc:
            scoring_errors.append(f"{metric_name}: {exc}")
            outcome = EvaluationMetricResult(
                name=metric_name,
                provider="deepeval",
                score=None,
                verdict="error",
                error=str(exc),
            )
        results.append(outcome)

    return results, scoring_errors
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+
3
+ from fred_deepeval_cli.core.models import StructuralCheckResult
4
+
5
+
6
+ def _tool_steps(trace: dict, kind: str, tool_name: str) -> list[dict]:
7
+ return [
8
+ step
9
+ for step in trace.get("steps", [])
10
+ if step.get("kind") == kind and step.get("tool_name") == tool_name
11
+ ]
12
+
13
+
14
def _has_tool_call(trace: dict, tool_name: str) -> bool:
    """Return True when the trace holds at least one ``tool_call`` step for *tool_name*."""
    matching_calls = _tool_steps(trace, "tool_call", tool_name)
    return len(matching_calls) > 0
16
+
17
+
18
def _has_successful_tool_result(trace: dict, tool_name: str) -> bool:
    """Return True when some ``tool_result`` step of *tool_name* looks successful.

    A step counts as successful when it is not flagged ``is_error`` and its
    ``content`` is a non-blank string that does not start with ``"Error:"``
    (leading whitespace ignored).
    """

    def _looks_successful(step: dict) -> bool:
        if step.get("is_error"):
            return False
        content = step.get("content") or ""
        if not isinstance(content, str) or not content.strip():
            return False
        return not content.lstrip().startswith("Error:")

    return any(
        _looks_successful(step)
        for step in _tool_steps(trace, "tool_result", tool_name)
    )
27
+
28
+
29
def build_structural_checks(trace: dict, profile: str = "default") -> list[StructuralCheckResult]:
    """Compute the structural checks defined for *profile*.

    * ``"rag"``: ``rag_tool_used`` (``knowledge_search`` is listed in
      ``tools_called``) and ``rag_context_nonempty`` (``retrieval_context``
      is truthy).
    * ``"sql"``: ``sql_query_executed`` (a ``read_query`` tool call exists and
      has a successful result) and ``sql_no_execution_error`` (no trace
      error, no ``node_error`` step and no step flagged ``is_error``).
    * any other profile: no checks.

    Args:
        trace: Decoded trace. Missing or ``None`` ``"tools_called"`` and
            ``"steps"`` values are treated as empty lists in the checks
            computed directly here.
        profile: Resolved evaluation profile.

    Returns:
        The check results, in the order listed above.
    """
    checks = []

    if profile == "rag":
        # `or []` also covers an explicit JSON null, for which the `in` test
        # would raise TypeError.
        tools_called = trace.get("tools_called") or []
        checks.append(StructuralCheckResult(
            name="rag_tool_used",
            passed="knowledge_search" in tools_called,
        ))
        checks.append(StructuralCheckResult(
            name="rag_context_nonempty",
            passed=bool(trace.get("retrieval_context")),
        ))

    elif profile == "sql":
        steps = trace.get("steps") or []
        checks.append(StructuralCheckResult(
            name="sql_query_executed",
            passed=_has_tool_call(trace, "read_query") and _has_successful_tool_result(trace, "read_query"),
        ))
        checks.append(StructuralCheckResult(
            name="sql_no_execution_error",
            passed=not trace.get("error") and not any(
                s.get("kind") == "node_error" or s.get("is_error")
                for s in steps
            ),
        ))

    return checks
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import timedelta
4
+
5
+ from temporalio import activity, workflow
6
+ from temporalio.client import Client
7
+ from temporalio.worker import Worker, UnsandboxedWorkflowRunner
8
+ from temporalio.testing import WorkflowEnvironment
9
+
10
+ from fred_deepeval_cli.core.models import EvaluationCaseRequest
11
+ from fred_deepeval_cli.core.evaluator import evaluate_case_sync
12
+ from fred_deepeval_cli.core.judge_factory import build_judge
13
+
14
+
15
@activity.defn
async def evaluate_question_activity(params: dict) -> dict:
    """Evaluate one dataset question (one question = one Temporal activity).

    Args:
        params: Required keys: ``"id"``, ``"agent_id"``, ``"input"``,
            ``"session_id"``, ``"user_id"``, ``"base_url"``. Optional keys:
            ``"expected_answer"``, ``"profile"``, ``"team_id"``,
            ``"search_policy"``, ``"access_token"``.

    Returns:
        A plain dict with ``id``, ``input``, ``outcome``, ``profile``,
        ``rag_ok``, ``structural_checks`` and ``metrics`` (keyed by metric
        name). If building the judge or running the evaluation raises, the
        dict has ``outcome="error"``, empty checks and metrics, and an
        ``error`` message.
    """
    # Function-scope import so the block is self-contained.
    import asyncio

    runtime_context = {"user_id": params["user_id"]}
    if params.get("team_id"):
        runtime_context["team_id"] = params["team_id"]
    if params.get("search_policy"):
        runtime_context["search_policy"] = params["search_policy"]

    request = EvaluationCaseRequest(
        agent_id=params["agent_id"],
        input=params["input"],
        session_id=params["session_id"],
        expected_output=params.get("expected_answer"),
        profile=params.get("profile", "auto"),
        runtime_context=runtime_context,
    )

    try:
        judge = build_judge()
        # evaluate_case_sync blocks on HTTP and judge calls. Calling it
        # directly from this async activity would stall the worker's event
        # loop for the whole evaluation, so it runs in a worker thread.
        result = await asyncio.to_thread(
            evaluate_case_sync,
            base_url=params["base_url"],
            request=request,
            judge=judge,
            access_token=params.get("access_token"),
        )
    except Exception as e:
        return {
            "id": params["id"],
            "input": params["input"],
            "outcome": "error",
            "profile": "unknown",
            "rag_ok": False,
            "structural_checks": [],
            "metrics": [],
            "error": str(e),
        }

    # all([]) is True, so require at least one check: a result without
    # structural checks (e.g. an execution_error) must not be reported as
    # having passed them.
    rag_ok = bool(result.structural_checks) and all(
        c.passed for c in result.structural_checks
    )
    metrics_by_name = {m.name: m.model_dump() for m in result.metrics}

    return {
        "id": params["id"],
        "input": params["input"],
        "outcome": result.outcome,
        "profile": result.profile,
        "rag_ok": rag_ok,
        "structural_checks": [c.model_dump() for c in result.structural_checks],
        "metrics": metrics_by_name,
    }
63
+
64
+
65
@workflow.defn
class RagDatasetWorkflow:
    """Workflow that evaluates a list of questions one after another."""

    @workflow.run
    async def run(self, questions: list[dict]) -> list[dict]:
        """Run ``evaluate_question_activity`` once per question, sequentially.

        Each activity is awaited before the next one is scheduled and is
        given a 10-minute start-to-close timeout.

        Returns:
            The activity results, in the order of *questions*.
        """
        per_question_timeout = timedelta(minutes=10)
        collected: list[dict] = []
        for question in questions:
            outcome = await workflow.execute_activity(
                evaluate_question_activity,
                question,
                start_to_close_timeout=per_question_timeout,
            )
            collected.append(outcome)
        return collected
78
+
79
+
80
async def run_with_temporal(questions: list[dict]) -> list[dict]:
    """Run the dataset workflow in a local Temporal environment.

    The environment is started with ``WorkflowEnvironment.start_local()``, so
    no externally managed Temporal server is needed. A worker for the
    ``rag-eval`` task queue runs for the duration of the workflow.

    Returns:
        The list of per-question result dicts produced by the workflow.
    """
    queue = "rag-eval"
    async with await WorkflowEnvironment.start_local() as env:
        worker = Worker(
            env.client,
            task_queue=queue,
            workflows=[RagDatasetWorkflow],
            activities=[evaluate_question_activity],
            workflow_runner=UnsandboxedWorkflowRunner(),
        )
        async with worker:
            results: list[dict] = await env.client.execute_workflow(
                RagDatasetWorkflow.run,
                questions,
                id="rag-dataset-eval",
                task_queue=queue,
            )
            return results
97
+
98
+
99
async def run_with_temporal_server(questions: list[dict], server_url: str) -> list[dict]:
    """Run the dataset workflow against an existing Temporal server.

    Args:
        questions: Question dicts forwarded to the workflow.
        server_url: Address passed to ``Client.connect``.

    Returns:
        The list of per-question result dicts produced by the workflow.
    """
    queue = "rag-eval"
    client = await Client.connect(server_url)
    worker = Worker(
        client,
        task_queue=queue,
        workflows=[RagDatasetWorkflow],
        activities=[evaluate_question_activity],
        workflow_runner=UnsandboxedWorkflowRunner(),
    )
    # NOTE(review): the workflow id is fixed, so two campaigns started
    # concurrently on the same server would presumably collide — confirm.
    async with worker:
        results: list[dict] = await client.execute_workflow(
            RagDatasetWorkflow.run,
            questions,
            id="rag-dataset-eval",
            task_queue=queue,
        )
        return results
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ import httpx
4
+
5
+
6
def make_response(payload: dict) -> httpx.Response:
    """Build a 200 ``httpx.Response`` carrying *payload* as JSON.

    The response is attached to a synthetic ``POST`` request for a local
    ``/agents/evaluate`` URL, which lets ``raise_for_status()`` be called on it.
    """
    url = "http://127.0.0.1:8000/fred/agents/v2/agents/evaluate"
    return httpx.Response(200, json=payload, request=httpx.Request("POST", url))
12
+
13
+
14
def make_trace(
    *,
    session_id: str = "eval-001",
    agent_id: str = "fred.test.assistant",
    agent_tags: list[str] | None = None,
    input: str = "echo bonjour",
    output: str | None = "Echo: echo bonjour",
    error: str | None = None,
    latency_ms: int = 123,
    model_name: str | None = None,
    token_usage: dict | None = None,
    finish_reason: str | None = None,
    steps: list[dict] | None = None,
    retrieval_context: list[str] | None = None,
    tools_called: list[str] | None = None,
) -> dict:
    """Build a trace dict for tests; every field is keyword-only with a default.

    The list-valued fields (``agent_tags``, ``steps``, ``retrieval_context``,
    ``tools_called``) are normalised to ``[]`` when ``None`` or empty; all
    other values are copied through unchanged.
    """
    trace = dict(
        session_id=session_id,
        agent_id=agent_id,
        agent_tags=agent_tags or [],
        input=input,
        output=output,
        error=error,
        latency_ms=latency_ms,
        model_name=model_name,
        token_usage=token_usage,
        finish_reason=finish_reason,
        steps=steps or [],
        retrieval_context=retrieval_context or [],
        tools_called=tools_called or [],
    )
    return trace
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ from fred_deepeval_cli.core.evaluator import evaluate_case_sync
6
+ from fred_deepeval_cli.core.models import EvaluationCaseRequest, EvaluationCaseResult
7
+
8
+
9
async def evaluate_case(
    request: EvaluationCaseRequest,
    *,
    base_url: str,
    judge=None,
    access_token: str | None = None,
) -> EvaluationCaseResult:
    """Async wrapper around ``evaluate_case_sync``.

    The synchronous evaluator is run in a worker thread through
    ``asyncio.to_thread`` so the caller's event loop is not blocked.

    Args:
        request: The evaluation case to run.
        base_url: Base URL of the pod (keyword-only).
        judge: Optional judge model forwarded to the evaluator.
        access_token: Optional bearer token forwarded to the evaluator.

    Returns:
        The result produced by ``evaluate_case_sync``.
    """
    result = await asyncio.to_thread(
        evaluate_case_sync,
        base_url,
        request,
        judge=judge,
        access_token=access_token,
    )
    return result
@@ -0,0 +1,70 @@
1
+ Metadata-Version: 2.4
2
+ Name: fred-deepeval-cli
3
+ Version: 0.1.0
4
+ Summary: External CLI for evaluating Fred agent turns via /agents/evaluate
5
+ License: Apache-2.0
6
+ Requires-Python: <3.13,>=3.12
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: fred-sdk>=2.0.7
9
+ Requires-Dist: fred-runtime>=2.0.8
10
+ Provides-Extra: dev
11
+ Requires-Dist: bandit>=1.8.6; extra == "dev"
12
+ Requires-Dist: basedpyright==1.31.0; extra == "dev"
13
+ Requires-Dist: detect-secrets>=1.5.0; extra == "dev"
14
+ Requires-Dist: pytest>=8.4.2; extra == "dev"
15
+ Requires-Dist: pytest-cov>=6.2.1; extra == "dev"
16
+ Requires-Dist: pytest-socket>=0.7.0; extra == "dev"
17
+ Requires-Dist: ruff>=0.12.5; extra == "dev"
18
+ Provides-Extra: eval
19
+ Requires-Dist: deepeval; extra == "eval"
20
+ Requires-Dist: litellm; extra == "eval"
21
+ Requires-Dist: python-dotenv; extra == "eval"
22
+ Requires-Dist: rich>=13.0; extra == "eval"
23
+ Requires-Dist: temporalio; extra == "eval"
24
+
25
+ # fred-deepeval-cli
26
+
27
+ External CLI for evaluating one Fred agent turn through `POST /agents/evaluate`.
28
+
29
+ ## Purpose
30
+
31
+ This project provides a small external CLI that:
32
+ - calls a Fred pod `/agents/evaluate` endpoint
33
+ - receives an `EvalTrace`
34
+ - classifies the turn outcome
35
+ - resolves an evaluation preset from `agent_tags`
36
+ - computes structural checks
37
+ - scores the trace with DeepEval
38
+
39
+ ## Commands
40
+
41
+ ```bash
42
+ make dev
43
+ make eval-dev
44
+ make test
45
+ make code-quality
46
+ make cli
47
+ make score BASE_URL=http://127.0.0.1:8000/fred/agents/v2 AGENT_ID=fred.test.assistant INPUT="echo bonjour" SESSION_ID=eval-001 USER_ID=alice
48
+ make sql-scenarios BASE_URL=http://127.0.0.1:8000/fred/agents/v2
+ ```
49
+
50
+ ## Documentation
51
+
52
+ | Topic | File |
53
+ | --- | --- |
54
+ | Evaluate any Fred agent pod | `docs/evaluating-any-fred-agent.md` |
55
+ | RAG evaluation — approach and metrics | `docs/rag-evaluation-rfc.md` |
56
+ | RAG local setup guide | `docs/rag-local-setup.md` |
57
+ | SQL evaluation | `docs/sql-evaluation.md` |
58
+ | OTel export strategy | `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §13` |
59
+
60
+ ## Architecture — EVAL-01 Phase 1
61
+
62
+ This CLI is being restructured into a reusable library core so the Fred
63
+ Control Plane evaluation worker can call it directly without spawning a subprocess.
64
+
65
+ - `fred_deepeval_cli/core/` — callable library (models, evaluator, profiles, scorer, judge factory)
66
+ - `fred_deepeval_cli/cli/` — thin CLI adapter over the core
67
+ - `fred_deepeval_cli/worker_adapter.py` — public entrypoint for the Control Plane worker
68
+
69
+ The CLI interface and JSON output remain unchanged.
70
+ See EVAL-01 Phase 1 issue and `fred/docs/swift/rfc/AGENT-EVALUATION-RFC.md §7.3`.
@@ -0,0 +1,20 @@
1
+ fred_deepeval_cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ fred_deepeval_cli/dataset_workflow.py,sha256=6sbuVdqQd5cAHWDC2_7RVLp3eDMUL5GgerH5BwXnGLs,3845
3
+ fred_deepeval_cli/test_helpers.py,sha256=hNXs76H61svc-c05_Cgv1w1Sa0_owZRX-pve0WG8QQk,1278
4
+ fred_deepeval_cli/worker_adapter.py,sha256=vkbT1BYUVlI-Pe6P7y-dKCZLWPKUysGGSdXeS-CDCwY,531
5
+ fred_deepeval_cli/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ fred_deepeval_cli/cli/display.py,sha256=Z4ZWImcdwmGM0s1E0UCmohqZhK_Nt6FzEeeRLaNfeY4,8764
7
+ fred_deepeval_cli/cli/main.py,sha256=BpW0XYnRyTukxOUfj0fS6wHZSSSUgoQfwPKRPjuNS5M,3148
8
+ fred_deepeval_cli/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ fred_deepeval_cli/core/config_loader.py,sha256=JzeYdjrcCY7WaoqxLesIMorzdE70zcjMtmzhqrGrldA,1574
10
+ fred_deepeval_cli/core/evaluator.py,sha256=TYMW_nlXQWl8d2hCJ8QhCWY0T_PwjYvp-rr385RFMSs,2873
11
+ fred_deepeval_cli/core/judge_factory.py,sha256=23Crpu1-KLVjh27_DP9asmfOQit9mDqKDy-ja06Q8ik,1603
12
+ fred_deepeval_cli/core/models.py,sha256=Q1LAxPyYJTyiwSsgJEGfsjVxJBWSWtY0fJ2OuVljY9g,1071
13
+ fred_deepeval_cli/core/profiles.py,sha256=6LmKMt6SjQFBKfz0CNiYkmVvlTf_vJl8jid1Lq73UJE,512
14
+ fred_deepeval_cli/core/scorer.py,sha256=dmdLNKWg4ujR2Pn9qkYjUCYanFM8dP6-uz679GjxPGE,2409
15
+ fred_deepeval_cli/core/structural_checks.py,sha256=-NNCZyo4rNeJ0BIKcOh-XdYNrCwOqPZBjQ1MzS35lwA,1827
16
+ fred_deepeval_cli-0.1.0.dist-info/METADATA,sha256=FUbvNEH_yQ8fqPRz3B85QlftG7K9xlC5erZ9toLbSm4,2513
17
+ fred_deepeval_cli-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
18
+ fred_deepeval_cli-0.1.0.dist-info/entry_points.txt,sha256=Eq5rEKpSr9gTzWhuU4NOJrOE389NzfnMk8kPxAvmVXw,70
19
+ fred_deepeval_cli-0.1.0.dist-info/top_level.txt,sha256=wUGpDJqehShF9gIzSY1xf07RuzSHi6oq5oJ1SUCBLkk,18
20
+ fred_deepeval_cli-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ fred-deepeval-cli = fred_deepeval_cli.cli.main:main
@@ -0,0 +1 @@
1
+ fred_deepeval_cli