agno 2.3.8__py3-none-any.whl → 2.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +134 -82
- agno/db/mysql/__init__.py +2 -1
- agno/db/mysql/async_mysql.py +2888 -0
- agno/db/mysql/mysql.py +17 -8
- agno/db/mysql/utils.py +139 -6
- agno/db/postgres/async_postgres.py +10 -5
- agno/db/postgres/postgres.py +7 -2
- agno/db/schemas/evals.py +1 -0
- agno/db/singlestore/singlestore.py +5 -1
- agno/db/sqlite/async_sqlite.py +2 -2
- agno/eval/__init__.py +10 -0
- agno/eval/agent_as_judge.py +860 -0
- agno/eval/base.py +29 -0
- agno/eval/utils.py +2 -1
- agno/exceptions.py +7 -0
- agno/knowledge/embedder/openai.py +8 -8
- agno/knowledge/knowledge.py +1142 -176
- agno/media.py +22 -6
- agno/models/aws/claude.py +8 -7
- agno/models/base.py +27 -1
- agno/models/deepseek/deepseek.py +67 -0
- agno/models/google/gemini.py +65 -11
- agno/models/google/utils.py +22 -0
- agno/models/message.py +2 -0
- agno/models/openai/chat.py +4 -0
- agno/os/app.py +64 -74
- agno/os/interfaces/a2a/router.py +3 -4
- agno/os/interfaces/agui/router.py +2 -0
- agno/os/router.py +3 -1607
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +581 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/evals.py +26 -6
- agno/os/routers/evals/schemas.py +34 -2
- agno/os/routers/evals/utils.py +101 -20
- agno/os/routers/knowledge/knowledge.py +1 -1
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +496 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +545 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +1 -559
- agno/os/utils.py +139 -2
- agno/team/team.py +73 -16
- agno/tools/file_generation.py +12 -6
- agno/tools/firecrawl.py +15 -7
- agno/utils/hooks.py +64 -5
- agno/utils/http.py +2 -2
- agno/utils/media.py +11 -1
- agno/utils/print_response/agent.py +8 -0
- agno/utils/print_response/team.py +8 -0
- agno/vectordb/pgvector/pgvector.py +88 -51
- agno/workflow/parallel.py +3 -3
- agno/workflow/step.py +14 -2
- agno/workflow/types.py +38 -2
- agno/workflow/workflow.py +12 -4
- {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/METADATA +7 -2
- {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/RECORD +62 -49
- {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/WHEEL +0 -0
- {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/licenses/LICENSE +0 -0
- {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/top_level.txt +0 -0
agno/eval/agent_as_judge.py (added)
@@ -0,0 +1,860 @@
+from dataclasses import asdict, dataclass, field
+from inspect import iscoroutinefunction
+from os import getenv
+from textwrap import dedent
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union
+from uuid import uuid4
+
+from pydantic import BaseModel, Field
+
+from agno.agent import Agent
+from agno.db.base import AsyncBaseDb, BaseDb
+from agno.db.schemas.evals import EvalType
+from agno.eval.base import BaseEval
+from agno.eval.utils import async_log_eval, log_eval_run, store_result_in_file
+from agno.exceptions import EvalError
+from agno.models.base import Model
+from agno.run.agent import RunInput, RunOutput
+from agno.run.team import TeamRunInput, TeamRunOutput
+from agno.utils.log import log_warning, logger, set_log_level_to_debug, set_log_level_to_info
+
+if TYPE_CHECKING:
+    from rich.console import Console
+
+
+class NumericJudgeResponse(BaseModel):
+    """Response schema for numeric scoring mode."""
+
+    score: int = Field(..., ge=1, le=10, description="Score between 1 and 10.")
+    reason: str = Field(..., description="Detailed reasoning for the evaluation.")
+
+
+class BinaryJudgeResponse(BaseModel):
+    """Response schema for binary scoring mode."""
+
+    passed: bool = Field(..., description="Pass/fail result.")
+    reason: str = Field(..., description="Detailed reasoning for the evaluation.")
+
+
+@dataclass
+class AgentAsJudgeEvaluation:
+    """Result of a single agent-as-judge evaluation."""
+
+    input: str
+    output: str
+    criteria: str
+    score: Optional[int]
+    reason: str
+    passed: bool
+
+    def print_eval(self, console: Optional["Console"] = None):
+        from rich.box import ROUNDED
+        from rich.console import Console
+        from rich.markdown import Markdown
+        from rich.table import Table
+
+        if console is None:
+            console = Console()
+
+        status_style = "green" if self.passed else "red"
+        status_text = "PASSED" if self.passed else "FAILED"
+
+        results_table = Table(
+            box=ROUNDED,
+            border_style="blue",
+            show_header=False,
+            title="[ Agent As Judge Evaluation ]",
+            title_style="bold sky_blue1",
+            title_justify="center",
+        )
+        results_table.add_row("Input", self.input[:200] + "..." if len(self.input) > 200 else self.input)
+        results_table.add_row("Output", self.output[:200] + "..." if len(self.output) > 200 else self.output)
+        if self.score is not None:
+            results_table.add_row("Score", f"{self.score}/10")
+        results_table.add_row("Status", f"[{status_style}]{status_text}[/{status_style}]")
+        results_table.add_row("Reason", Markdown(self.reason))
+        console.print(results_table)
+
+
+@dataclass
+class AgentAsJudgeResult:
+    """Aggregated results from agent-as-judge evaluations."""
+
+    run_id: str
+    results: List[AgentAsJudgeEvaluation] = field(default_factory=list)
+    avg_score: Optional[float] = field(init=False)
+    min_score: Optional[float] = field(init=False)
+    max_score: Optional[float] = field(init=False)
+    std_dev_score: Optional[float] = field(init=False)
+    pass_rate: float = field(init=False)
+
+    def __post_init__(self):
+        self.compute_stats()
+
+    def compute_stats(self):
+        import statistics
+
+        if self.results and len(self.results) > 0:
+            passed = [r.passed for r in self.results]
+            self.pass_rate = sum(passed) / len(passed) * 100
+
+            # Compute score statistics only for numeric mode (where score is not None)
+            scores = [r.score for r in self.results if r.score is not None]
+            if scores:
+                self.avg_score = statistics.mean(scores)
+                self.min_score = min(scores)
+                self.max_score = max(scores)
+                self.std_dev_score = statistics.stdev(scores) if len(scores) > 1 else 0.0
+            else:
+                # Binary mode - no scores
+                self.avg_score = None
+                self.min_score = None
+                self.max_score = None
+                self.std_dev_score = None
+        else:
+            self.avg_score = None
+            self.min_score = None
+            self.max_score = None
+            self.std_dev_score = None
+            self.pass_rate = 0.0
+
+    def print_summary(self, console: Optional["Console"] = None):
+        from rich.box import ROUNDED
+        from rich.console import Console
+        from rich.table import Table
+
+        if console is None:
+            console = Console()
+
+        summary_table = Table(
+            box=ROUNDED,
+            border_style="blue",
+            show_header=False,
+            title="[ Agent As Judge Evaluation Summary ]",
+            title_style="bold sky_blue1",
+            title_justify="center",
+            padding=(0, 2),  # Add horizontal padding to make table wider
+            min_width=45,  # Ensure table is wide enough for title
+        )
+
+        num_results = len(self.results)
+        summary_table.add_row("Number of Evaluations", f"{num_results}")
+        summary_table.add_row("Pass Rate", f"{self.pass_rate:.1f}%")
+
+        # Only show score statistics for numeric mode (when scores exist)
+        if self.avg_score is not None:
+            # For single evaluation, show "Score" instead of statistics
+            if num_results == 1:
+                summary_table.add_row("Score", f"{self.avg_score:.2f}/10")
+            # For multiple evaluations, show full statistics
+            elif num_results > 1:
+                summary_table.add_row("Average Score", f"{self.avg_score:.2f}/10")
+                summary_table.add_row("Min Score", f"{self.min_score:.2f}/10")
+                summary_table.add_row("Max Score", f"{self.max_score:.2f}/10")
+                if self.std_dev_score and self.std_dev_score > 0:
+                    summary_table.add_row("Std Deviation", f"{self.std_dev_score:.2f}")
+
+        console.print(summary_table)
+
+    def print_results(self, console: Optional["Console"] = None):
+        for result in self.results:
+            result.print_eval(console)
+
+
+@dataclass
+class AgentAsJudgeEval(BaseEval):
+    """Evaluate agent outputs using custom criteria with an LLM judge."""
+
+    # Core evaluation fields
+    criteria: str = ""
+    scoring_strategy: Literal["numeric", "binary"] = "binary"
+    threshold: int = 7  # Only used for numeric strategy
+    on_fail: Optional[Callable[["AgentAsJudgeEvaluation"], None]] = None
+    additional_guidelines: Optional[Union[str, List[str]]] = None
+
+    # Evaluation metadata
+    name: Optional[str] = None
+
+    # Model configuration
+    model: Optional[Model] = None
+    evaluator_agent: Optional[Agent] = None
+
+    # Output options
+    print_summary: bool = False
+    print_results: bool = False
+    file_path_to_save_results: Optional[str] = None
+    debug_mode: bool = getenv("AGNO_DEBUG", "false").lower() == "true"
+    db: Optional[Union[BaseDb, AsyncBaseDb]] = None
+    telemetry: bool = True
+    run_in_background: bool = False
+
+    def __post_init__(self):
+        """Validate scoring_strategy and threshold."""
+        if self.scoring_strategy == "numeric" and not 1 <= self.threshold <= 10:
+            raise ValueError(f"threshold must be between 1 and 10, got {self.threshold}")
+
+    def get_evaluator_agent(self) -> Agent:
+        """Return the evaluator agent. If not provided, build it based on the model and criteria."""
+        # Select response schema based on scoring strategy
+        response_schema = NumericJudgeResponse if self.scoring_strategy == "numeric" else BinaryJudgeResponse
+
+        if self.evaluator_agent is not None:
+            # Ensure custom evaluator has the required output_schema for structured responses
+            self.evaluator_agent.output_schema = response_schema
+            return self.evaluator_agent
+
+        model = self.model
+        if model is None:
+            try:
+                from agno.models.openai import OpenAIChat
+
+                model = OpenAIChat(id="gpt-5-mini")
+            except (ModuleNotFoundError, ImportError) as e:
+                logger.exception(e)
+                raise EvalError(
+                    "Agno uses `openai` as the default model provider. Please run `pip install openai` to use the default evaluator."
+                )
+
+        # Build instructions based on scoring strategy
+        instructions_parts = ["## Criteria", self.criteria, ""]
+
+        if self.scoring_strategy == "numeric":
+            instructions_parts.extend(
+                [
+                    "## Scoring (1-10)",
+                    "- 1-2: Completely fails the criteria",
+                    "- 3-4: Major issues",
+                    "- 5-6: Partial success with significant issues",
+                    "- 7-8: Mostly meets criteria with minor issues",
+                    "- 9-10: Fully meets or exceeds criteria",
+                    "",
+                    "## Instructions",
+                    "1. Carefully evaluate the output against the criteria above",
+                    "2. Provide a score from 1-10",
+                    "3. Provide detailed reasoning that references specific parts of the output",
+                ]
+            )
+        else:  # binary
+            instructions_parts.extend(
+                [
+                    "## Evaluation",
+                    "Determine if the output PASSES or FAILS the criteria above.",
+                    "",
+                    "## Instructions",
+                    "1. Carefully evaluate the output against the criteria above",
+                    "2. Decide if it passes (true) or fails (false)",
+                    "3. Provide detailed reasoning that references specific parts of the output",
+                ]
+            )
+
+        # Add additional guidelines if provided
+        if self.additional_guidelines:
+            instructions_parts.append("")
+            instructions_parts.append("## Additional Guidelines")
+            if isinstance(self.additional_guidelines, str):
+                instructions_parts.append(self.additional_guidelines)
+            else:
+                for guideline in self.additional_guidelines:
+                    instructions_parts.append(f"- {guideline}")
+
+        # Add closing instruction
+        instructions_parts.append("")
+        instructions_parts.append("Be objective and thorough in your evaluation.")
+
+        return Agent(
+            model=model,
+            description="You are an expert evaluator. Score outputs objectively based on the provided criteria.",
+            instructions="\n".join(instructions_parts),
+            output_schema=response_schema,
+        )
+
+    def _evaluate(self, input: str, output: str, evaluator_agent: Agent) -> Optional[AgentAsJudgeEvaluation]:
+        """Evaluate a single input/output pair."""
+        try:
+            prompt = dedent(f"""\
+                <input>
+                {input}
+                </input>
+
+                <output>
+                {output}
+                </output>
+                """)
+
+            response = evaluator_agent.run(prompt).content
+            if not isinstance(response, (NumericJudgeResponse, BinaryJudgeResponse)):
+                raise EvalError(f"Invalid response: {response}")
+
+            # Determine pass/fail based on scoring strategy and response type
+            if isinstance(response, NumericJudgeResponse):
+                score = response.score
+                passed = score >= self.threshold
+            else:  # BinaryJudgeResponse
+                score = None
+                passed = response.passed
+
+            evaluation = AgentAsJudgeEvaluation(
+                input=input,
+                output=output,
+                criteria=self.criteria,
+                score=score,
+                reason=response.reason,
+                passed=passed,
+            )
+
+            # Trigger on_fail callback if evaluation failed
+            if not passed and self.on_fail:
+                try:
+                    if iscoroutinefunction(self.on_fail):
+                        log_warning(
+                            f"Cannot use async on_fail callback with sync evaluation. Use arun() instead. Skipping callback: {self.on_fail.__name__}"
+                        )
+                    else:
+                        self.on_fail(evaluation)
+                except Exception as e:
+                    logger.warning(f"on_fail callback error: {e}")
+
+            return evaluation
+        except Exception as e:
+            logger.exception(f"Evaluation failed: {e}")
+            return None
+
+    async def _aevaluate(self, input: str, output: str, evaluator_agent: Agent) -> Optional[AgentAsJudgeEvaluation]:
+        """Evaluate a single input/output pair asynchronously."""
+        try:
+            prompt = dedent(f"""\
+                <input>
+                {input}
+                </input>
+
+                <output>
+                {output}
+                </output>
+                """)
+
+            response = await evaluator_agent.arun(prompt)
+            judge_response = response.content
+            if not isinstance(judge_response, (NumericJudgeResponse, BinaryJudgeResponse)):
+                raise EvalError(f"Invalid response: {judge_response}")
+
+            # Determine pass/fail based on response type
+            if isinstance(judge_response, NumericJudgeResponse):
+                score = judge_response.score
+                passed = score >= self.threshold
+            else:  # BinaryJudgeResponse
+                score = None
+                passed = judge_response.passed
+
+            evaluation = AgentAsJudgeEvaluation(
+                input=input,
+                output=output,
+                criteria=self.criteria,
+                score=score,
+                reason=judge_response.reason,
+                passed=passed,
+            )
+
+            # Trigger on_fail callback if evaluation failed
+            if not passed and self.on_fail:
+                try:
+                    if iscoroutinefunction(self.on_fail):
+                        await self.on_fail(evaluation)
+                    else:
+                        self.on_fail(evaluation)
+                except Exception as e:
+                    logger.warning(f"on_fail callback error: {e}")
+
+            return evaluation
+        except Exception as e:
+            logger.exception(f"Async evaluation failed: {e}")
+            return None
+
+    def _log_eval_to_db(
+        self,
+        run_id: str,
+        result: AgentAsJudgeResult,
+        agent_id: Optional[str] = None,
+        model_id: Optional[str] = None,
+        model_provider: Optional[str] = None,
+        team_id: Optional[str] = None,
+        evaluated_component_name: Optional[str] = None,
+    ) -> None:
+        """Helper to log evaluation to database."""
+        if not self.db:
+            return
+
+        log_eval_run(
+            db=self.db,  # type: ignore
+            run_id=run_id,
+            run_data=asdict(result),
+            eval_type=EvalType.AGENT_AS_JUDGE,
+            agent_id=agent_id,
+            model_id=model_id,
+            model_provider=model_provider,
+            name=self.name,
+            team_id=team_id,
+            evaluated_component_name=evaluated_component_name,
+            eval_input={
+                "criteria": self.criteria,
+                "scoring_strategy": self.scoring_strategy,
+                "threshold": self.threshold if self.scoring_strategy == "numeric" else None,
+                "additional_guidelines": self.additional_guidelines,
+            },
+        )
+
+    async def _async_log_eval_to_db(
+        self,
+        run_id: str,
+        result: AgentAsJudgeResult,
+        agent_id: Optional[str] = None,
+        model_id: Optional[str] = None,
+        model_provider: Optional[str] = None,
+        team_id: Optional[str] = None,
+        evaluated_component_name: Optional[str] = None,
+    ) -> None:
+        """Helper to log evaluation to database asynchronously."""
+        if not self.db:
+            return
+
+        await async_log_eval(
+            db=self.db,
+            run_id=run_id,
+            run_data=asdict(result),
+            eval_type=EvalType.AGENT_AS_JUDGE,
+            agent_id=agent_id,
+            model_id=model_id,
+            model_provider=model_provider,
+            name=self.name,
+            team_id=team_id,
+            evaluated_component_name=evaluated_component_name,
+            eval_input={
+                "criteria": self.criteria,
+                "scoring_strategy": self.scoring_strategy,
+                "threshold": self.threshold if self.scoring_strategy == "numeric" else None,
+                "additional_guidelines": self.additional_guidelines,
+            },
+        )
+
+    def run(
+        self,
+        *,
+        input: Optional[str] = None,
+        output: Optional[str] = None,
+        cases: Optional[List[Dict[str, str]]] = None,
+        print_summary: bool = False,
+        print_results: bool = False,
+    ) -> Optional[AgentAsJudgeResult]:
+        """Evaluate input/output against the criteria.
+
+        Supports both single evaluation and batch evaluation:
+
+        Args:
+            input: Input text for single evaluation
+            output: Output text for single evaluation
+            cases: List of input/output pairs for batch evaluation
+            print_summary: Whether to print summary
+            print_results: Whether to print detailed results
+        """
+        # Generate unique run_id for this execution
+        run_id = str(uuid4())
+
+        # Validate parameters
+        single_mode = input is not None or output is not None
+        batch_mode = cases is not None
+
+        if single_mode and batch_mode:
+            raise ValueError("Provide either (input, output) OR cases, not both")
+
+        if not single_mode and not batch_mode:
+            raise ValueError("Must provide either (input, output) OR cases")
+
+        # Batch mode if cases provided
+        if batch_mode and cases is not None:
+            return self._run_batch(cases=cases, run_id=run_id, print_summary=print_summary, print_results=print_results)
+
+        # Validate single mode has both input and output
+        if input is None or output is None:
+            raise ValueError("Both input and output are required for single evaluation")
+
+        # Single evaluation logic
+        from rich.console import Console
+        from rich.live import Live
+        from rich.status import Status
+
+        if isinstance(self.db, AsyncBaseDb):
+            raise ValueError("Use arun() with async DB.")
+
+        set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
+        result = AgentAsJudgeResult(run_id=run_id)
+
+        console = Console()
+        with Live(console=console, transient=True) as live_log:
+            evaluator = self.get_evaluator_agent()
+
+            status = Status("Running evaluation...", spinner="dots", speed=1.0, refresh_per_second=10)
+            live_log.update(status)
+
+            evaluation = self._evaluate(input=input, output=output, evaluator_agent=evaluator)
+
+            if evaluation:
+                result.results.append(evaluation)
+                result.compute_stats()
+
+            status.stop()
+
+        # Save result to file
+        if self.file_path_to_save_results:
+            store_result_in_file(
+                file_path=self.file_path_to_save_results,
+                result=result,
+                eval_id=run_id,
+                name=self.name,
+            )
+
+        # Print results
+        if self.print_results or print_results:
+            result.print_results(console)
+        if self.print_summary or print_summary:
+            result.print_summary(console)
+
+        # Log to DB
+        self._log_eval_to_db(run_id=run_id, result=result)
+
+        if self.telemetry:
+            from agno.api.evals import EvalRunCreate, create_eval_run_telemetry
+
+            create_eval_run_telemetry(
+                eval_run=EvalRunCreate(
+                    run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
+                )
+            )
+
+        return result
+
+    async def arun(
+        self,
+        *,
+        input: Optional[str] = None,
+        output: Optional[str] = None,
+        cases: Optional[List[Dict[str, str]]] = None,
+        print_summary: bool = False,
+        print_results: bool = False,
+    ) -> Optional[AgentAsJudgeResult]:
+        """Evaluate input/output against the criteria asynchronously.
+
+        Supports both single evaluation and batch evaluation:
+
+        Args:
+            input: Input text for single evaluation
+            output: Output text for single evaluation
+            cases: List of input/output pairs for batch evaluation
+            print_summary: Whether to print summary
+            print_results: Whether to print detailed results
+        """
+        # Generate unique run_id for this execution
+        run_id = str(uuid4())
+
+        # Validate parameters
+        single_mode = input is not None or output is not None
+        batch_mode = cases is not None
+
+        if single_mode and batch_mode:
+            raise ValueError("Provide either (input, output) OR cases, not both")
+
+        if not single_mode and not batch_mode:
+            raise ValueError("Must provide either (input, output) OR cases")
+
+        # Batch mode if cases provided
+        if batch_mode and cases is not None:
+            return await self._arun_batch(
+                cases=cases, run_id=run_id, print_summary=print_summary, print_results=print_results
+            )
+
+        # Validate single mode has both input and output
+        if input is None or output is None:
+            raise ValueError("Both input and output are required for single evaluation")
+
+        # Single evaluation logic
+        from rich.console import Console
+        from rich.live import Live
+        from rich.status import Status
+
+        set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
+        result = AgentAsJudgeResult(run_id=run_id)
+
+        console = Console()
+        with Live(console=console, transient=True) as live_log:
+            evaluator = self.get_evaluator_agent()
+
+            status = Status("Running evaluation...", spinner="dots", speed=1.0, refresh_per_second=10)
+            live_log.update(status)
+
+            evaluation = await self._aevaluate(input=input, output=output, evaluator_agent=evaluator)
+
+            if evaluation:
+                result.results.append(evaluation)
+                result.compute_stats()
+
+            status.stop()
+
+        # Save result to file
+        if self.file_path_to_save_results:
+            store_result_in_file(
+                file_path=self.file_path_to_save_results,
+                result=result,
+                eval_id=run_id,
+                name=self.name,
+            )
+
+        # Print results
+        if self.print_results or print_results:
+            result.print_results(console)
+        if self.print_summary or print_summary:
+            result.print_summary(console)
+
+        # Log to DB
+        await self._async_log_eval_to_db(run_id=run_id, result=result)
+
+        if self.telemetry:
+            from agno.api.evals import EvalRunCreate, async_create_eval_run_telemetry
+
+            await async_create_eval_run_telemetry(
+                eval_run=EvalRunCreate(
+                    run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
+                )
+            )
+
+        return result
+
+    def _run_batch(
+        self,
+        cases: List[Dict[str, str]],
+        run_id: str,
+        *,
+        print_summary: bool = True,
+        print_results: bool = False,
+    ) -> Optional[AgentAsJudgeResult]:
+        """Private helper: Evaluate multiple input/output pairs.
+
+        Args:
+            cases: List of dicts with 'input' and 'output' keys
+            run_id: Unique ID for this evaluation run
+        """
+        from rich.console import Console
+        from rich.live import Live
+        from rich.status import Status
+
+        if isinstance(self.db, AsyncBaseDb):
+            raise ValueError("Use arun() with async DB.")
+
+        set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
+        result = AgentAsJudgeResult(run_id=run_id)
+
+        console = Console()
+        with Live(console=console, transient=True) as live_log:
+            evaluator = self.get_evaluator_agent()
+
+            for i, case in enumerate(cases):
+                status = Status(f"Evaluating {i + 1}/{len(cases)}...", spinner="dots")
+                live_log.update(status)
+
+                evaluation = self._evaluate(input=case["input"], output=case["output"], evaluator_agent=evaluator)
+                if evaluation:
+                    result.results.append(evaluation)
+                    result.compute_stats()
+
+            status.stop()
+
+        # Save result to file
+        if self.file_path_to_save_results:
+            store_result_in_file(
+                file_path=self.file_path_to_save_results,
+                result=result,
+                eval_id=run_id,
+                name=self.name,
+            )
+
+        # Print results
+        if self.print_results or print_results:
+            result.print_results(console)
+        if self.print_summary or print_summary:
+            result.print_summary(console)
+
+        # Log to DB
+        self._log_eval_to_db(run_id=run_id, result=result)
+
+        if self.telemetry:
+            from agno.api.evals import EvalRunCreate, create_eval_run_telemetry
+
+            create_eval_run_telemetry(
+                eval_run=EvalRunCreate(
+                    run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
+                )
+            )
+
+        return result
+
+    async def _arun_batch(
+        self,
+        cases: List[Dict[str, str]],
+        run_id: str,
+        *,
+        print_summary: bool = True,
+        print_results: bool = False,
+    ) -> Optional[AgentAsJudgeResult]:
+        """Private helper: Evaluate multiple input/output pairs asynchronously.
+
+        Args:
+            cases: List of dicts with 'input' and 'output' keys
+            run_id: Unique ID for this evaluation run
+        """
+        from rich.console import Console
+        from rich.live import Live
+        from rich.status import Status
+
+        set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
+        result = AgentAsJudgeResult(run_id=run_id)
+
+        console = Console()
+        with Live(console=console, transient=True) as live_log:
+            evaluator = self.get_evaluator_agent()
+
+            for i, case in enumerate(cases):
+                status = Status(f"Evaluating {i + 1}/{len(cases)}...", spinner="dots")
+                live_log.update(status)
+
+                evaluation = await self._aevaluate(
+                    input=case["input"], output=case["output"], evaluator_agent=evaluator
+                )
+                if evaluation:
+                    result.results.append(evaluation)
+                    result.compute_stats()
+
+            status.stop()
+
+        # Save result to file
+        if self.file_path_to_save_results:
+            store_result_in_file(
+                file_path=self.file_path_to_save_results,
+                result=result,
+                eval_id=run_id,
+                name=self.name,
+            )
+
+        # Print results
+        if self.print_results or print_results:
+            result.print_results(console)
+        if self.print_summary or print_summary:
+            result.print_summary(console)
+
+        # Log to DB
+        await self._async_log_eval_to_db(run_id=run_id, result=result)
+
+        if self.telemetry:
+            from agno.api.evals import EvalRunCreate, async_create_eval_run_telemetry
+
+            await async_create_eval_run_telemetry(
+                eval_run=EvalRunCreate(
+                    run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
+                )
+            )
+
+        return result
+
+    def _get_telemetry_data(self, result: Optional[AgentAsJudgeResult] = None) -> Dict[str, Any]:
+        return {
+            "criteria_length": len(self.criteria) if self.criteria else 0,
+            "scoring_strategy": self.scoring_strategy,
+            "threshold": self.threshold if self.scoring_strategy == "numeric" else None,
+            "num_results": len(result.results) if result else 0,
+        }
+
+    # BaseEval hook methods
+    def pre_check(self, run_input: Union[RunInput, TeamRunInput]) -> None:
+        raise ValueError("Pre-hooks are not supported")
+
+    async def async_pre_check(self, run_input: Union[RunInput, TeamRunInput]) -> None:
+        raise ValueError("Pre-hooks are not supported")
+
+    def post_check(self, run_output: Union[RunOutput, TeamRunOutput]) -> None:
+        """Perform sync post-check to evaluate agent output."""
+        input_str = run_output.input.input_content_string() if run_output.input else ""
+        output_str = str(run_output.content) if run_output.content else ""
+
+        # Temporarily disable DB logging
+        original_db = self.db
+        self.db = None
+
+        # Run evaluation and capture result
+        result = self.run(
+            input=input_str, output=output_str, print_results=self.print_results, print_summary=self.print_summary
+        )
+
+        # Restore DB and log with context from run_output
+        self.db = original_db
+
+        if isinstance(self.db, AsyncBaseDb):
+            raise ValueError("post_check() requires sync DB. Use async_post_check() with async DB.")
+
+        # Extract metadata from run_output
+        if isinstance(run_output, RunOutput):
+            agent_id = run_output.agent_id
+            team_id = None
+            model_id = run_output.model
+            model_provider = run_output.model_provider
+        elif isinstance(run_output, TeamRunOutput):
+            agent_id = None
+            team_id = run_output.team_id
+            model_id = run_output.model
+            model_provider = run_output.model_provider
+
+        # Log to DB if we have a valid result (use run_id from result)
+        if result:
+            self._log_eval_to_db(
+                run_id=result.run_id,
+                result=result,
+                agent_id=agent_id,
+                model_id=model_id,
+                model_provider=model_provider,
+                team_id=team_id,
+            )
+
+    async def async_post_check(self, run_output: Union[RunOutput, TeamRunOutput]) -> None:
+        """Perform async post-check to evaluate agent output."""
+        input_str = run_output.input.input_content_string() if run_output.input else ""
+        output_str = str(run_output.content) if run_output.content else ""
+
+        # Temporarily disable DB logging
+        original_db = self.db
+        self.db = None
+
+        # Run evaluation and capture result
+        result = await self.arun(
+            input=input_str, output=output_str, print_results=self.print_results, print_summary=self.print_summary
+        )
+
+        # Restore DB and log with context from run_output
+        self.db = original_db
+
+        # Extract metadata from run_output
+        if isinstance(run_output, RunOutput):
+            agent_id = run_output.agent_id
+            team_id = None
+            model_id = run_output.model
+            model_provider = run_output.model_provider
+        elif isinstance(run_output, TeamRunOutput):
+            agent_id = None
+            team_id = run_output.team_id
+            model_id = run_output.model
+            model_provider = run_output.model_provider
+
+        # Log to DB if we have a valid result (use run_id from result)
+        if result:
+            await self._async_log_eval_to_db(
+                run_id=result.run_id,
+                result=result,
+                agent_id=agent_id,
+                model_id=model_id,
+                model_provider=model_provider,
+                team_id=team_id,
+            )
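
For context, a minimal usage sketch of the evaluator added above, based only on the API shown in this hunk; it is not part of the package. It assumes OPENAI_API_KEY is set so the default OpenAIChat(id="gpt-5-mini") judge can be built, and it imports from the full module path since the re-exports added in agno/eval/__init__.py are not shown in this diff.

# Hypothetical usage sketch (not from the diff): exercising the new AgentAsJudgeEval
from agno.eval.agent_as_judge import AgentAsJudgeEval

evaluation = AgentAsJudgeEval(
    criteria="The answer must name at least one concrete risk of the proposed approach.",
    scoring_strategy="numeric",  # default is "binary"; "numeric" returns a 1-10 score
    threshold=7,  # numeric mode only: score >= threshold counts as passed
    print_results=True,
)

# Single input/output pair
result = evaluation.run(
    input="Should we cache user sessions in process memory?",
    output="Yes, it is fast, but it breaks horizontal scaling and loses data on restart.",
)

# Batch mode: pass a list of cases instead of input/output
# result = evaluation.run(cases=[{"input": "...", "output": "..."}])

if result:
    print(result.pass_rate, result.avg_score)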