multivon-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multivon_eval/__init__.py +56 -0
- multivon_eval/case.py +57 -0
- multivon_eval/cli.py +72 -0
- multivon_eval/dataset.py +50 -0
- multivon_eval/evaluators/__init__.py +35 -0
- multivon_eval/evaluators/agent.py +223 -0
- multivon_eval/evaluators/base.py +27 -0
- multivon_eval/evaluators/conversation.py +136 -0
- multivon_eval/evaluators/deterministic.py +252 -0
- multivon_eval/evaluators/llm_judge.py +396 -0
- multivon_eval/reporters/__init__.py +0 -0
- multivon_eval/reporters/terminal.py +64 -0
- multivon_eval/result.py +145 -0
- multivon_eval/suite.py +191 -0
- multivon_eval-0.1.0.dist-info/METADATA +387 -0
- multivon_eval-0.1.0.dist-info/RECORD +20 -0
- multivon_eval-0.1.0.dist-info/WHEEL +5 -0
- multivon_eval-0.1.0.dist-info/entry_points.txt +2 -0
- multivon_eval-0.1.0.dist-info/licenses/LICENSE +185 -0
- multivon_eval-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
multivon-eval — AI evaluation for teams that ship models to production.
|
|
3
|
+
|
|
4
|
+
Evaluate language models, agents, and pipelines across text, RAG,
|
|
5
|
+
agentic workflows, and multi-turn conversations.
|
|
6
|
+
|
|
7
|
+
Quick start:
|
|
8
|
+
from multivon_eval import EvalSuite, EvalCase, Relevance, Faithfulness, NotEmpty
|
|
9
|
+
|
|
10
|
+
suite = EvalSuite("My Eval")
|
|
11
|
+
suite.add_cases([EvalCase(input="What is 2+2?", expected_output="4")])
|
|
12
|
+
suite.add_evaluators(NotEmpty(), Relevance())
|
|
13
|
+
report = suite.run(my_model_fn)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
from .suite import EvalSuite
|
|
19
|
+
from .case import EvalCase, AgentStep, ToolCall
|
|
20
|
+
from .dataset import load, load_jsonl, load_csv
|
|
21
|
+
from .evaluators import (
|
|
22
|
+
# Deterministic
|
|
23
|
+
NotEmpty, ExactMatch, Contains, RegexMatch,
|
|
24
|
+
JSONSchemaEval, WordCount, Latency, MaxLatency,
|
|
25
|
+
BLEU, ROUGE, StartsWith,
|
|
26
|
+
# LLM-as-judge (QAG)
|
|
27
|
+
Faithfulness, Hallucination, Relevance, Coherence,
|
|
28
|
+
Toxicity, Bias, Summarization, AnswerAccuracy,
|
|
29
|
+
ContextPrecision, ContextRecall, CustomRubric, GEval,
|
|
30
|
+
# Agent
|
|
31
|
+
ToolCallAccuracy, ToolArgumentAccuracy,
|
|
32
|
+
PlanQuality, TaskCompletion, StepFaithfulness,
|
|
33
|
+
# Conversation
|
|
34
|
+
ConversationRelevance, KnowledgeRetention,
|
|
35
|
+
ConversationCompleteness, TurnConsistency,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"__version__",
|
|
40
|
+
"EvalSuite", "EvalCase", "AgentStep", "ToolCall",
|
|
41
|
+
"load", "load_jsonl", "load_csv",
|
|
42
|
+
# Deterministic
|
|
43
|
+
"NotEmpty", "ExactMatch", "Contains", "RegexMatch",
|
|
44
|
+
"JSONSchemaEval", "WordCount", "Latency", "MaxLatency",
|
|
45
|
+
"BLEU", "ROUGE", "StartsWith",
|
|
46
|
+
# LLM-as-judge
|
|
47
|
+
"Faithfulness", "Hallucination", "Relevance", "Coherence",
|
|
48
|
+
"Toxicity", "Bias", "Summarization", "AnswerAccuracy",
|
|
49
|
+
"ContextPrecision", "ContextRecall", "CustomRubric", "GEval",
|
|
50
|
+
# Agent
|
|
51
|
+
"ToolCallAccuracy", "ToolArgumentAccuracy",
|
|
52
|
+
"PlanQuality", "TaskCompletion", "StepFaithfulness",
|
|
53
|
+
# Conversation
|
|
54
|
+
"ConversationRelevance", "KnowledgeRetention",
|
|
55
|
+
"ConversationCompleteness", "TurnConsistency",
|
|
56
|
+
]
|
multivon_eval/case.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class ToolCall:
    """Record of one tool/function invocation issued by an agent.

    Stores the tool's name, the arguments it was invoked with, and the
    value that came back from the tool.
    """

    # Name of the tool or function that was invoked.
    name: str
    # Arguments passed to the tool; every instance gets its own fresh dict.
    arguments: dict[str, Any] = field(default_factory=dict)
    # Value returned by the tool; stays None when no result was recorded.
    result: Any = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class AgentStep:
    """A single entry in the execution trace produced by an agent.

    Every field is optional, so a step may carry any combination of a
    thought, tool calls, and an output.
    """

    # Free-text reasoning recorded for this step; empty when absent.
    thought: str = ""
    # Tool invocations made during this step; a fresh list per instance.
    tool_calls: list[ToolCall] = field(default_factory=list)
    # Text produced by this step; empty when absent.
    output: str = ""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class EvalCase:
    """
    One evaluation test case.

    Attributes:
        input: The prompt, question, or user message.
        expected_output: Ideal response (used by ExactMatch, AnswerAccuracy).
        context: Retrieved documents or context (used by Faithfulness, Hallucination).
        conversation: Multi-turn message history for conversation evaluators.
            Format: [{"role": "user"/"assistant", "content": "..."}]
        agent_trace: Sequence of agent steps for agent evaluators.
        expected_tool_calls: Ordered list of tool names the agent should call.
        metadata: Arbitrary key-value data (e.g. source_id, difficulty).
        tags: Labels for filtering reports.
    """

    input: str
    expected_output: str | None = None
    context: str | list[str] | None = None
    conversation: list[dict[str, str]] | None = None
    agent_trace: list[AgentStep] | None = None
    expected_tool_calls: list[str] | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    tags: list[str] = field(default_factory=list)

    def context_str(self) -> str:
        """Return the context as one string.

        A list of documents is joined with a blank line between entries,
        a missing context yields an empty string, and any other value is
        returned unchanged.
        """
        ctx = self.context
        if isinstance(ctx, list):
            return "\n\n".join(ctx)
        return "" if ctx is None else ctx

    def conversation_str(self) -> str:
        """Render the conversation as ``ROLE: content`` lines.

        Returns an empty string when there is no conversation. Each
        message must provide the ``role`` and ``content`` keys.
        """
        if not self.conversation:
            return ""
        rendered = []
        for message in self.conversation:
            speaker = message["role"].upper()
            rendered.append(f"{speaker}: {message['content']}")
        return "\n".join(rendered)
|
multivon_eval/cli.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
multivon-eval CLI — run eval files and view reports.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
multivon-eval run eval.py
|
|
6
|
+
multivon-eval report results.json
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import sys
|
|
10
|
+
import json
|
|
11
|
+
import argparse
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def cmd_run(args):
    """Run the eval script at ``args.file`` as though it were ``__main__``.

    The script is executed in-process via ``runpy.run_path``, so any
    ``if __name__ == "__main__":`` section inside it is triggered.
    """
    # Imported lazily so the module is only loaded for the "run" sub-command.
    from runpy import run_path

    run_path(args.file, run_name="__main__")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cmd_report(args):
    """Pretty-print a saved JSON report.

    Reads the JSON file at ``args.file`` and prints the suite name, the
    model (when present), the pass/fail summary and, when available, a
    table of average scores per evaluator.

    Text taken from the file (suite, model and evaluator names) is escaped
    before it is embedded in Rich markup, so a value such as ``"[v2]"`` is
    shown literally instead of being parsed as a style tag.
    """
    from rich.console import Console
    from rich.markup import escape
    from rich.table import Table
    from rich import box

    console = Console()
    # JSON is UTF-8; do not let the platform's locale encoding decide.
    with open(args.file, encoding="utf-8") as f:
        data = json.load(f)

    console.rule(f"[bold]{escape(str(data.get('suite', 'Eval Report')))}[/]")
    if data.get("model"):
        console.print(f" Model: [dim]{escape(str(data['model']))}[/]")

    # "or" also covers keys that are present but null in the file.
    summary = data.get("summary") or {}
    pass_rate = summary.get("pass_rate") or 0
    console.print(f"\n Total: {summary.get('total')} "
                  f"Passed: [green]{summary.get('passed')}[/] "
                  f"Failed: [red]{summary.get('failed')}[/] "
                  f"Pass Rate: {pass_rate:.1%}\n")

    by_ev = summary.get("by_evaluator", {})
    if by_ev:
        t = Table(box=box.SIMPLE_HEAD, title="By Evaluator", padding=(0, 1))
        t.add_column("Evaluator")
        t.add_column("Avg Score", justify="right")
        for name, score in by_ev.items():
            # Colour bands: >= 0.7 green, >= 0.5 yellow, otherwise red.
            color = "green" if score >= 0.7 else "yellow" if score >= 0.5 else "red"
            t.add_row(escape(str(name)), f"[{color}]{score:.3f}[/]")
        console.print(t)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main():
    """Entry point of the ``multivon-eval`` command line tool.

    Parses the command line and forwards to the handler of the chosen
    sub-command. When no sub-command is given the help text is printed
    and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(prog="multivon-eval", description="Multivon Eval CLI")
    subcommands = parser.add_subparsers(dest="command")

    run_parser = subcommands.add_parser("run", help="Execute an eval file")
    run_parser.add_argument("file", help="Python eval file to run")

    report_parser = subcommands.add_parser("report", help="Display a saved JSON report")
    report_parser.add_argument("file", help="JSON results file")

    args = parser.parse_args()

    # Sub-command name -> handler function.
    handler = {"run": cmd_run, "report": cmd_report}.get(args.command)
    if handler is None:
        parser.print_help()
        sys.exit(1)
    handler(args)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
if __name__ == "__main__":
|
|
72
|
+
main()
|
multivon_eval/dataset.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import csv
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from .case import EvalCase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_jsonl(path: str) -> list[EvalCase]:
    """Load test cases from a JSONL file. Each line is a JSON object.

    Blank lines are skipped. Every object must have an ``input`` key; the
    optional keys ``expected_output``, ``context``, ``conversation``,
    ``expected_tool_calls``, ``metadata`` and ``tags`` are forwarded to
    ``EvalCase`` when present.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One ``EvalCase`` per non-blank line, in file order.

    Raises:
        KeyError: If an object has no ``input`` key.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    cases = []
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
    # json.loads would otherwise reject on the first line.
    with open(path, encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            cases.append(EvalCase(
                input=data["input"],
                expected_output=data.get("expected_output"),
                context=data.get("context"),
                conversation=data.get("conversation"),
                expected_tool_calls=data.get("expected_tool_calls"),
                # "or" keeps the container defaults when the key is null.
                metadata=data.get("metadata") or {},
                tags=data.get("tags") or [],
            ))
    return cases
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_csv(path: str) -> list[EvalCase]:
    """Load test cases from a CSV file with columns: input, expected_output, context, tags.

    Only ``input`` is required. ``tags`` is a comma-separated list inside
    one cell; empty ``expected_output`` / ``context`` cells become ``None``.

    Args:
        path: Location of the ``.csv`` file.

    Returns:
        One ``EvalCase`` per data row, in file order.

    Raises:
        KeyError: If the file has no ``input`` column.
    """
    cases = []
    # "utf-8-sig" reads plain UTF-8 and also drops the BOM that spreadsheet
    # exports often add; otherwise the first header would be "\ufeffinput".
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # DictReader fills cells missing from a short row with None, so
            # a default passed to .get() is not enough here.
            raw_tags = row.get("tags") or ""
            tags = [t.strip() for t in raw_tags.split(",") if t.strip()]
            cases.append(EvalCase(
                input=row["input"],
                expected_output=row.get("expected_output") or None,
                context=row.get("context") or None,
                tags=tags,
            ))
    return cases
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load(path: str) -> list[EvalCase]:
    """Auto-detect format from file extension and load cases.

    The extension is compared case-insensitively, so ``DATA.CSV`` and
    ``cases.JSONL`` are accepted as well.

    Args:
        path: Location of a ``.jsonl`` or ``.csv`` file.

    Returns:
        The cases produced by ``load_jsonl`` or ``load_csv``.

    Raises:
        ValueError: If the extension is neither ``.jsonl`` nor ``.csv``.
    """
    p = Path(path)
    suffix = p.suffix.lower()
    if suffix == ".jsonl":
        return load_jsonl(path)
    if suffix == ".csv":
        return load_csv(path)
    raise ValueError(f"Unsupported format: {p.suffix}. Use .jsonl or .csv")
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from .deterministic import (
|
|
2
|
+
NotEmpty, ExactMatch, Contains, RegexMatch,
|
|
3
|
+
JSONSchemaEval, WordCount, Latency, MaxLatency,
|
|
4
|
+
BLEU, ROUGE, StartsWith,
|
|
5
|
+
)
|
|
6
|
+
from .llm_judge import (
|
|
7
|
+
Faithfulness, Hallucination, Relevance, Coherence,
|
|
8
|
+
Toxicity, Bias, Summarization, AnswerAccuracy,
|
|
9
|
+
ContextPrecision, ContextRecall, CustomRubric, GEval,
|
|
10
|
+
)
|
|
11
|
+
from .agent import (
|
|
12
|
+
ToolCallAccuracy, ToolArgumentAccuracy,
|
|
13
|
+
PlanQuality, TaskCompletion, StepFaithfulness,
|
|
14
|
+
)
|
|
15
|
+
from .conversation import (
|
|
16
|
+
ConversationRelevance, KnowledgeRetention,
|
|
17
|
+
ConversationCompleteness, TurnConsistency,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
# Deterministic
|
|
22
|
+
"NotEmpty", "ExactMatch", "Contains", "RegexMatch",
|
|
23
|
+
"JSONSchemaEval", "WordCount", "Latency", "MaxLatency",
|
|
24
|
+
"BLEU", "ROUGE", "StartsWith",
|
|
25
|
+
# LLM-as-judge
|
|
26
|
+
"Faithfulness", "Hallucination", "Relevance", "Coherence",
|
|
27
|
+
"Toxicity", "Bias", "Summarization", "AnswerAccuracy",
|
|
28
|
+
"ContextPrecision", "ContextRecall", "CustomRubric", "GEval",
|
|
29
|
+
# Agent
|
|
30
|
+
"ToolCallAccuracy", "ToolArgumentAccuracy",
|
|
31
|
+
"PlanQuality", "TaskCompletion", "StepFaithfulness",
|
|
32
|
+
# Conversation
|
|
33
|
+
"ConversationRelevance", "KnowledgeRetention",
|
|
34
|
+
"ConversationCompleteness", "TurnConsistency",
|
|
35
|
+
]
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agent evaluators — evaluate multi-step AI agent execution traces.
|
|
3
|
+
|
|
4
|
+
These evaluators operate on AgentStep traces attached to EvalCase,
|
|
5
|
+
not just the final output string. This is the key differentiator:
|
|
6
|
+
framework-agnostic evaluation of tool use, planning, and task completion.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import json
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from .base import Evaluator
|
|
13
|
+
from .llm_judge import _judge_call, _parse_yes_no, _qag_eval
|
|
14
|
+
from ..case import EvalCase, AgentStep
|
|
15
|
+
from ..result import EvalResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _trace_str(trace: list[AgentStep]) -> str:
|
|
19
|
+
"""Render an agent trace as readable text for the judge."""
|
|
20
|
+
lines = []
|
|
21
|
+
for i, step in enumerate(trace, 1):
|
|
22
|
+
lines.append(f"Step {i}:")
|
|
23
|
+
if step.thought:
|
|
24
|
+
lines.append(f" Thought: {step.thought}")
|
|
25
|
+
for tc in step.tool_calls:
|
|
26
|
+
args = json.dumps(tc.arguments, indent=2) if tc.arguments else "{}"
|
|
27
|
+
result_str = str(tc.result)[:200] if tc.result is not None else "(no result)"
|
|
28
|
+
lines.append(f" Tool call: {tc.name}({args})")
|
|
29
|
+
lines.append(f" Result: {result_str}")
|
|
30
|
+
if step.output:
|
|
31
|
+
lines.append(f" Output: {step.output}")
|
|
32
|
+
return "\n".join(lines)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ToolCallAccuracy(Evaluator):
    """
    Evaluates whether the agent called the right tools in the right order.

    Checks:
    - Were all expected tools called?
    - Were they called in the correct order (if order matters)?
    - Were any unexpected tools called?

    Unexpected tools are listed in the reason but do not lower the score.

    Requires case.agent_trace and case.expected_tool_calls.
    """
    name = "tool_call_accuracy"

    def __init__(self, require_order: bool = False, threshold: float = 0.7):
        """
        Args:
            require_order: When True, calls are compared position by
                position against the expected list; otherwise only the
                set of called tool names matters.
            threshold: Minimum score for the result to count as passed.
        """
        super().__init__(threshold)
        self.require_order = require_order

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Score the tool names found in the trace against the expected ones.

        Returns a 0.0 result when the case has no trace or no expected
        tool calls. ``output`` is not used by this evaluator.
        """
        if not case.agent_trace:
            return self._result(0.0, "No agent_trace provided")
        if not case.expected_tool_calls:
            return self._result(0.0, "No expected_tool_calls provided")

        actual_calls = [
            tc.name
            for step in case.agent_trace
            for tc in step.tool_calls
        ]
        expected = case.expected_tool_calls
        called_set = set(actual_calls)
        expected_set = set(expected)

        if self.require_order:
            # Ordered match: fraction of expected positions whose actual
            # call at the same position has the same name.
            matches = sum(1 for a, e in zip(actual_calls, expected) if a == e)
            score = matches / len(expected)
        else:
            # Unordered: fraction of expected tools that were called
            score = len(called_set & expected_set) / len(expected_set)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # reason text is identical from run to run (set order is not).
        missing = [e for e in dict.fromkeys(expected) if e not in called_set]
        unexpected = [a for a in dict.fromkeys(actual_calls) if a not in expected_set]

        reasons = [f"Called: {actual_calls}", f"Expected: {expected}"]
        if missing:
            reasons.append(f"Missing tools: {missing}")
        if unexpected:
            reasons.append(f"Unexpected tools: {unexpected}")

        return self._result(score, "\n".join(reasons))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class ToolArgumentAccuracy(Evaluator):
    """
    Evaluates whether tool arguments were correct and well-formed.
    Uses LLM judge to assess argument quality since exact matching is too rigid.
    Requires case.agent_trace.
    """
    name = "tool_argument_accuracy"

    def __init__(self, threshold: float = 0.7, max_calls: int | None = 8):
        """
        Args:
            threshold: Minimum score for the result to count as passed.
            max_calls: Upper bound on how many tool calls are sent to the
                judge (one judge call each). ``None`` judges all of them.
        """
        super().__init__(threshold)
        self.max_calls = max_calls

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Ask the judge, per tool call, whether its arguments fit the task.

        The score is the fraction of judged calls answered "Yes". A judge
        call that raises counts as a failure and its error is put in the
        reason. ``output`` is not used by this evaluator.
        """
        if not case.agent_trace:
            return self._result(0.0, "No agent_trace provided")

        all_tool_calls = [tc for step in case.agent_trace for tc in step.tool_calls]
        if not all_tool_calls:
            return self._result(1.0, "No tool calls in trace")

        judged_calls = all_tool_calls[:self.max_calls]
        results, reasons = [], []
        for tc in judged_calls:
            # default=str keeps non-JSON values from aborting the evaluation.
            args_str = json.dumps(tc.arguments, indent=2, default=str) if tc.arguments else "{}"
            prompt = (
                f"Task: {case.input}\n\n"
                f"Tool called: {tc.name}\n"
                f"Arguments provided:\n{args_str}\n\n"
                f"Are these arguments appropriate and well-formed for the tool '{tc.name}' given the task?"
                f"\nAnswer \"Yes\" or \"No\"."
            )
            try:
                answer = _judge_call(prompt, max_tokens=10)
                good = _parse_yes_no(answer)
                results.append(good)
                reasons.append(f"{'✓' if good else '✗'} {tc.name}({args_str[:60]})")
            except Exception as e:
                # Best effort: a failing judge call is scored as "No".
                results.append(False)
                reasons.append(f"✗ {tc.name} (error: {e})")

        skipped = len(all_tool_calls) - len(judged_calls)
        if skipped:
            # Make the cap visible instead of silently ignoring calls.
            reasons.append(f"({skipped} further tool calls not judged)")

        score = sum(results) / len(results) if results else 0.0
        return self._result(score, "\n".join(reasons))
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class PlanQuality(Evaluator):
    """
    Judges the agent's plan as a whole: is the sequence of steps and tool
    calls logical, complete, and efficient?
    Requires case.agent_trace.
    """
    name = "plan_quality"

    def __init__(self, threshold: float = 0.7):
        super().__init__(threshold)

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Put yes/no questions about the plan to the judge and score them.

        Returns a 0.0 result when the case carries no agent trace.
        """
        steps = case.agent_trace
        if not steps:
            return self._result(0.0, "No agent_trace provided")

        rendered = _trace_str(steps)
        judge_context = "\n\n".join([
            f"Task: {case.input}",
            f"Agent execution trace:\n{rendered}",
            f"Final output: {output}",
        ])
        # For every question the desired answer is "yes".
        prompts = (
            "Does the agent's plan address all aspects of the task?",
            "Are the steps in the agent's plan in a logical order?",
            "Does the agent avoid redundant or unnecessary steps?",
            "Does each step in the plan follow logically from the previous one?",
            "Would an expert consider this plan efficient for the task?",
        )
        questions = [(text, True) for text in prompts]
        score, reasons = _qag_eval(questions, judge_context)
        return self._result(score, "\n".join(reasons))
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class TaskCompletion(Evaluator):
    """
    Judges whether the agent actually finished the task it was given.
    The final output is measured against the task goal; the agent trace,
    when present, is supplied to the judge as additional context.
    """
    name = "task_completion"

    def __init__(self, threshold: float = 0.7):
        super().__init__(threshold)

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Put yes/no questions about task completion to the judge."""
        sections = [f"Task: {case.input}"]
        if case.agent_trace:
            sections.append(f"Agent trace summary:\n{_trace_str(case.agent_trace)}")
        sections.append(f"Final output: {output}")
        judge_context = "\n\n".join(sections)

        # Second tuple element is the desired answer (False = "no" is good).
        questions = [
            ("Does the final output successfully complete the task?", True),
            ("Does the final output address all requirements of the task?", True),
            ("Is the final output a complete response (not partial or cut off)?", True),
            ("Did the agent fail to complete the task or produce an error?", False),
        ]
        score, reasons = _qag_eval(questions, judge_context)
        return self._result(score, "\n".join(reasons))
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class StepFaithfulness(Evaluator):
    """
    Evaluates whether each agent step faithfully follows from the task and prior steps.
    Detects hallucinated reasoning or steps that contradict the task.
    Requires case.agent_trace.
    """
    name = "step_faithfulness"

    def __init__(self, threshold: float = 0.7, max_steps: int | None = 8):
        """
        Args:
            threshold: Minimum score for the result to count as passed.
            max_steps: Upper bound on how many leading steps are sent to
                the judge (one judge call each). ``None`` judges all steps.
        """
        super().__init__(threshold)
        self.max_steps = max_steps

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Ask the judge, step by step, whether each step follows from the
        task and the steps before it.

        The score is the fraction of judged steps answered "Yes". A judge
        call that raises counts as a failure and its error is put in the
        reason. ``output`` is not used by this evaluator.
        """
        if not case.agent_trace:
            return self._result(0.0, "No agent_trace provided")

        judged_steps = case.agent_trace[:self.max_steps]
        results, reasons = [], []
        for i, step in enumerate(judged_steps, 1):
            # Everything before the current step is shown as prior context.
            prior = _trace_str(case.agent_trace[:i-1]) if i > 1 else "(no prior steps)"
            step_str = _trace_str([step])
            prompt = (
                f"Task: {case.input}\n\n"
                f"Prior steps:\n{prior}\n\n"
                f"Current step {i}:\n{step_str}\n\n"
                f"Does this step follow logically from the task and prior steps, "
                f"without introducing contradictions or hallucinated information?"
                f"\nAnswer \"Yes\" or \"No\"."
            )
            try:
                answer = _judge_call(prompt, max_tokens=10)
                faithful = _parse_yes_no(answer)
                results.append(faithful)
                thought_preview = step.thought[:60] if step.thought else "(no thought)"
                reasons.append(f"{'✓' if faithful else '✗'} Step {i}: {thought_preview}")
            except Exception as e:
                # Best effort: a failing judge call is scored as "No".
                results.append(False)
                reasons.append(f"✗ Step {i} (error: {e})")

        skipped = len(case.agent_trace) - len(judged_steps)
        if skipped:
            # Make the cap visible instead of silently ignoring steps.
            reasons.append(f"({skipped} later steps not judged)")

        score = sum(results) / len(results) if results else 0.0
        return self._result(score, f"{sum(results)}/{len(results)} steps faithful\n" + "\n".join(reasons))
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from ..case import EvalCase
|
|
4
|
+
from ..result import EvalResult
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Evaluator(ABC):
    """Abstract parent of every evaluator.

    Subclasses set ``name`` and implement ``evaluate``; ``_result`` wraps a
    score into an ``EvalResult`` and derives the pass/fail flag from the
    instance's threshold.
    """

    # Identifier stored in every result produced by this evaluator.
    name: str = "evaluator"
    # Class-level default; each instance sets its own value in __init__.
    threshold: float = 0.5

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    @abstractmethod
    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Score ``output`` for ``case``; implemented by subclasses."""
        ...

    def _result(self, score: float, reason: str = "", **metadata) -> EvalResult:
        """Build the result for ``score``.

        The result counts as passed when ``score`` is at least the
        threshold. Extra keyword arguments are stored as result metadata.
        """
        verdict = score >= self.threshold
        return EvalResult(
            evaluator=self.name,
            score=score,
            passed=verdict,
            reason=reason,
            metadata=metadata,
        )
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Conversation evaluators — evaluate multi-turn dialogue quality.
|
|
3
|
+
|
|
4
|
+
These evaluators assess the quality of conversational AI systems
|
|
5
|
+
over the full dialogue, not just a single response.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .base import Evaluator
|
|
10
|
+
from .llm_judge import _judge_call, _parse_yes_no, _qag_eval
|
|
11
|
+
from ..case import EvalCase
|
|
12
|
+
from ..result import EvalResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ConversationRelevance(Evaluator):
    """
    Judges whether the latest response is relevant to the ongoing conversation.
    Flags replies that disregard earlier context or switch topic unexpectedly.
    Requires case.conversation.
    """
    name = "conversation_relevance"

    def __init__(self, threshold: float = 0.7):
        super().__init__(threshold)

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Put yes/no relevance questions to the judge and score them.

        Returns a 0.0 result when the case has no conversation.
        """
        if not case.conversation:
            return self._result(0.0, "No conversation provided")

        history = case.conversation_str()
        judge_context = f"Conversation history:\n{history}\n\nLatest response: {output}"
        # Second tuple element is the desired answer (False = "no" is good).
        questions = [
            ("Is the latest response relevant to the conversation history?", True),
            ("Does the response address what the user was asking or discussing?", True),
            ("Does the response ignore important context from earlier in the conversation?", False),
            ("Does the response follow naturally from the preceding turns?", True),
        ]
        score, reasons = _qag_eval(questions, judge_context)
        return self._result(score, "\n".join(reasons))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class KnowledgeRetention(Evaluator):
    """
    Judges whether the model keeps hold of, and correctly uses, facts that
    were established earlier in the conversation.

    Requires case.conversation — the final response (output) is checked
    for consistency with what earlier turns introduced.
    """
    name = "knowledge_retention"

    def __init__(self, threshold: float = 0.7):
        super().__init__(threshold)

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Put yes/no retention questions to the judge and score them.

        Returns 0.0 without a conversation and 1.0 when the conversation
        has no user turns.
        """
        if not case.conversation:
            return self._result(0.0, "No conversation provided")

        # Only user turns can establish facts the response must retain.
        from_user = [
            message["content"]
            for message in case.conversation
            if message["role"] == "user"
        ]
        if not from_user:
            return self._result(1.0, "No user turns to retain facts from")

        history = case.conversation_str()
        judge_context = f"Conversation history:\n{history}\n\nLatest response:\n{output}"
        # Second tuple element is the desired answer (False = "no" is good).
        questions = [
            ("Does the response correctly recall facts mentioned by the user earlier in the conversation?", True),
            ("Does the response contradict information the user provided in a prior turn?", False),
            ("Does the response show awareness of the user's preferences or context established earlier?", True),
        ]
        score, reasons = _qag_eval(questions, judge_context)
        return self._result(score, "\n".join(reasons))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class ConversationCompleteness(Evaluator):
    """
    Judges whether the assistant fully resolved the user's goals across
    the whole conversation.

    The final output is treated as the culmination of a multi-turn dialogue.
    Requires case.conversation.
    """
    name = "conversation_completeness"

    def __init__(self, threshold: float = 0.7):
        super().__init__(threshold)

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Put yes/no completeness questions to the judge and score them.

        Returns a 0.0 result when the case has no conversation.
        """
        if not case.conversation:
            return self._result(0.0, "No conversation provided")

        # The first user turn stands in for the original goal; case.input
        # is the fallback when the conversation has no user turn.
        goal = case.input
        for message in case.conversation:
            if message["role"] == "user":
                goal = message["content"]
                break

        judge_context = "\n\n".join([
            f"Original user goal: {goal}",
            f"Full conversation:\n{case.conversation_str()}",
            f"Final response:\n{output}",
        ])
        # Second tuple element is the desired answer (False = "no" is good).
        questions = [
            ("By the end of the conversation, has the user's original goal been fully addressed?", True),
            ("Has the assistant left important questions from the user unanswered?", False),
            ("Does the final response bring the conversation to a satisfying resolution?", True),
            ("Would the user need to ask follow-up questions to get what they originally wanted?", False),
        ]
        score, reasons = _qag_eval(questions, judge_context)
        return self._result(score, "\n".join(reasons))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class TurnConsistency(Evaluator):
    """
    Verifies that the assistant does not contradict itself from turn to turn.
    Requires case.conversation.
    """
    name = "turn_consistency"

    def __init__(self, threshold: float = 0.8):
        super().__init__(threshold)

    def evaluate(self, case: EvalCase, output: str) -> EvalResult:
        """Put yes/no consistency questions to the judge and score them.

        Returns a 0.0 result when the case has no conversation.
        """
        if not case.conversation:
            return self._result(0.0, "No conversation provided")

        dialogue = case.conversation_str()
        judge_context = f"Conversation:\n{dialogue}\n\nLatest response:\n{output}"
        # Second tuple element is the desired answer (False = "no" is good).
        questions = [
            ("Is the latest response consistent with all prior assistant responses in the conversation?", True),
            ("Does the assistant contradict something it said in a previous turn?", False),
            ("Does the assistant maintain a consistent persona and tone throughout?", True),
        ]
        score, reasons = _qag_eval(questions, judge_context)
        return self._result(score, "\n".join(reasons))
|