agno-2.3.8-py3-none-any.whl → agno-2.3.9-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Files changed (62)
  1. agno/agent/agent.py +134 -82
  2. agno/db/mysql/__init__.py +2 -1
  3. agno/db/mysql/async_mysql.py +2888 -0
  4. agno/db/mysql/mysql.py +17 -8
  5. agno/db/mysql/utils.py +139 -6
  6. agno/db/postgres/async_postgres.py +10 -5
  7. agno/db/postgres/postgres.py +7 -2
  8. agno/db/schemas/evals.py +1 -0
  9. agno/db/singlestore/singlestore.py +5 -1
  10. agno/db/sqlite/async_sqlite.py +2 -2
  11. agno/eval/__init__.py +10 -0
  12. agno/eval/agent_as_judge.py +860 -0
  13. agno/eval/base.py +29 -0
  14. agno/eval/utils.py +2 -1
  15. agno/exceptions.py +7 -0
  16. agno/knowledge/embedder/openai.py +8 -8
  17. agno/knowledge/knowledge.py +1142 -176
  18. agno/media.py +22 -6
  19. agno/models/aws/claude.py +8 -7
  20. agno/models/base.py +27 -1
  21. agno/models/deepseek/deepseek.py +67 -0
  22. agno/models/google/gemini.py +65 -11
  23. agno/models/google/utils.py +22 -0
  24. agno/models/message.py +2 -0
  25. agno/models/openai/chat.py +4 -0
  26. agno/os/app.py +64 -74
  27. agno/os/interfaces/a2a/router.py +3 -4
  28. agno/os/interfaces/agui/router.py +2 -0
  29. agno/os/router.py +3 -1607
  30. agno/os/routers/agents/__init__.py +3 -0
  31. agno/os/routers/agents/router.py +581 -0
  32. agno/os/routers/agents/schema.py +261 -0
  33. agno/os/routers/evals/evals.py +26 -6
  34. agno/os/routers/evals/schemas.py +34 -2
  35. agno/os/routers/evals/utils.py +101 -20
  36. agno/os/routers/knowledge/knowledge.py +1 -1
  37. agno/os/routers/teams/__init__.py +3 -0
  38. agno/os/routers/teams/router.py +496 -0
  39. agno/os/routers/teams/schema.py +257 -0
  40. agno/os/routers/workflows/__init__.py +3 -0
  41. agno/os/routers/workflows/router.py +545 -0
  42. agno/os/routers/workflows/schema.py +75 -0
  43. agno/os/schema.py +1 -559
  44. agno/os/utils.py +139 -2
  45. agno/team/team.py +73 -16
  46. agno/tools/file_generation.py +12 -6
  47. agno/tools/firecrawl.py +15 -7
  48. agno/utils/hooks.py +64 -5
  49. agno/utils/http.py +2 -2
  50. agno/utils/media.py +11 -1
  51. agno/utils/print_response/agent.py +8 -0
  52. agno/utils/print_response/team.py +8 -0
  53. agno/vectordb/pgvector/pgvector.py +88 -51
  54. agno/workflow/parallel.py +3 -3
  55. agno/workflow/step.py +14 -2
  56. agno/workflow/types.py +38 -2
  57. agno/workflow/workflow.py +12 -4
  58. {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/METADATA +7 -2
  59. {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/RECORD +62 -49
  60. {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/WHEEL +0 -0
  61. {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/licenses/LICENSE +0 -0
  62. {agno-2.3.8.dist-info → agno-2.3.9.dist-info}/top_level.txt +0 -0
agno/eval/agent_as_judge.py (new file)
@@ -0,0 +1,860 @@
1
+ from dataclasses import asdict, dataclass, field
2
+ from inspect import iscoroutinefunction
3
+ from os import getenv
4
+ from textwrap import dedent
5
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union
6
+ from uuid import uuid4
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ from agno.agent import Agent
11
+ from agno.db.base import AsyncBaseDb, BaseDb
12
+ from agno.db.schemas.evals import EvalType
13
+ from agno.eval.base import BaseEval
14
+ from agno.eval.utils import async_log_eval, log_eval_run, store_result_in_file
15
+ from agno.exceptions import EvalError
16
+ from agno.models.base import Model
17
+ from agno.run.agent import RunInput, RunOutput
18
+ from agno.run.team import TeamRunInput, TeamRunOutput
19
+ from agno.utils.log import log_warning, logger, set_log_level_to_debug, set_log_level_to_info
20
+
21
+ if TYPE_CHECKING:
22
+ from rich.console import Console
23
+
24
+
25
+ class NumericJudgeResponse(BaseModel):
26
+ """Response schema for numeric scoring mode."""
27
+
28
+ score: int = Field(..., ge=1, le=10, description="Score between 1 and 10.")
29
+ reason: str = Field(..., description="Detailed reasoning for the evaluation.")
30
+
31
+
32
+ class BinaryJudgeResponse(BaseModel):
33
+ """Response schema for binary scoring mode."""
34
+
35
+ passed: bool = Field(..., description="Pass/fail result.")
36
+ reason: str = Field(..., description="Detailed reasoning for the evaluation.")
37
+
38
+
39
+ @dataclass
40
+ class AgentAsJudgeEvaluation:
41
+ """Result of a single agent-as-judge evaluation."""
42
+
43
+ input: str
44
+ output: str
45
+ criteria: str
46
+ score: Optional[int]
47
+ reason: str
48
+ passed: bool
49
+
50
+ def print_eval(self, console: Optional["Console"] = None):
51
+ from rich.box import ROUNDED
52
+ from rich.console import Console
53
+ from rich.markdown import Markdown
54
+ from rich.table import Table
55
+
56
+ if console is None:
57
+ console = Console()
58
+
59
+ status_style = "green" if self.passed else "red"
60
+ status_text = "PASSED" if self.passed else "FAILED"
61
+
62
+ results_table = Table(
63
+ box=ROUNDED,
64
+ border_style="blue",
65
+ show_header=False,
66
+ title="[ Agent As Judge Evaluation ]",
67
+ title_style="bold sky_blue1",
68
+ title_justify="center",
69
+ )
70
+ results_table.add_row("Input", self.input[:200] + "..." if len(self.input) > 200 else self.input)
71
+ results_table.add_row("Output", self.output[:200] + "..." if len(self.output) > 200 else self.output)
72
+ if self.score is not None:
73
+ results_table.add_row("Score", f"{self.score}/10")
74
+ results_table.add_row("Status", f"[{status_style}]{status_text}[/{status_style}]")
75
+ results_table.add_row("Reason", Markdown(self.reason))
76
+ console.print(results_table)
77
+
78
+
79
+ @dataclass
80
+ class AgentAsJudgeResult:
81
+ """Aggregated results from agent-as-judge evaluations."""
82
+
83
+ run_id: str
84
+ results: List[AgentAsJudgeEvaluation] = field(default_factory=list)
85
+ avg_score: Optional[float] = field(init=False)
86
+ min_score: Optional[float] = field(init=False)
87
+ max_score: Optional[float] = field(init=False)
88
+ std_dev_score: Optional[float] = field(init=False)
89
+ pass_rate: float = field(init=False)
90
+
91
+ def __post_init__(self):
92
+ self.compute_stats()
93
+
94
+ def compute_stats(self):
95
+ import statistics
96
+
97
+ if self.results and len(self.results) > 0:
98
+ passed = [r.passed for r in self.results]
99
+ self.pass_rate = sum(passed) / len(passed) * 100
100
+
101
+ # Compute score statistics only for numeric mode (where score is not None)
102
+ scores = [r.score for r in self.results if r.score is not None]
103
+ if scores:
104
+ self.avg_score = statistics.mean(scores)
105
+ self.min_score = min(scores)
106
+ self.max_score = max(scores)
107
+ self.std_dev_score = statistics.stdev(scores) if len(scores) > 1 else 0.0
108
+ else:
109
+ # Binary mode - no scores
110
+ self.avg_score = None
111
+ self.min_score = None
112
+ self.max_score = None
113
+ self.std_dev_score = None
114
+ else:
115
+ self.avg_score = None
116
+ self.min_score = None
117
+ self.max_score = None
118
+ self.std_dev_score = None
119
+ self.pass_rate = 0.0
120
+
121
+ def print_summary(self, console: Optional["Console"] = None):
122
+ from rich.box import ROUNDED
123
+ from rich.console import Console
124
+ from rich.table import Table
125
+
126
+ if console is None:
127
+ console = Console()
128
+
129
+ summary_table = Table(
130
+ box=ROUNDED,
131
+ border_style="blue",
132
+ show_header=False,
133
+ title="[ Agent As Judge Evaluation Summary ]",
134
+ title_style="bold sky_blue1",
135
+ title_justify="center",
136
+ padding=(0, 2), # Add horizontal padding to make table wider
137
+ min_width=45, # Ensure table is wide enough for title
138
+ )
139
+
140
+ num_results = len(self.results)
141
+ summary_table.add_row("Number of Evaluations", f"{num_results}")
142
+ summary_table.add_row("Pass Rate", f"{self.pass_rate:.1f}%")
143
+
144
+ # Only show score statistics for numeric mode (when scores exist)
145
+ if self.avg_score is not None:
146
+ # For single evaluation, show "Score" instead of statistics
147
+ if num_results == 1:
148
+ summary_table.add_row("Score", f"{self.avg_score:.2f}/10")
149
+ # For multiple evaluations, show full statistics
150
+ elif num_results > 1:
151
+ summary_table.add_row("Average Score", f"{self.avg_score:.2f}/10")
152
+ summary_table.add_row("Min Score", f"{self.min_score:.2f}/10")
153
+ summary_table.add_row("Max Score", f"{self.max_score:.2f}/10")
154
+ if self.std_dev_score and self.std_dev_score > 0:
155
+ summary_table.add_row("Std Deviation", f"{self.std_dev_score:.2f}")
156
+
157
+ console.print(summary_table)
158
+
159
+ def print_results(self, console: Optional["Console"] = None):
160
+ for result in self.results:
161
+ result.print_eval(console)
162
+
163
+
164
+ @dataclass
165
+ class AgentAsJudgeEval(BaseEval):
166
+ """Evaluate agent outputs using custom criteria with an LLM judge."""
167
+
168
+ # Core evaluation fields
169
+ criteria: str = ""
170
+ scoring_strategy: Literal["numeric", "binary"] = "binary"
171
+ threshold: int = 7 # Only used for numeric strategy
172
+ on_fail: Optional[Callable[["AgentAsJudgeEvaluation"], None]] = None
173
+ additional_guidelines: Optional[Union[str, List[str]]] = None
174
+
175
+ # Evaluation metadata
176
+ name: Optional[str] = None
177
+
178
+ # Model configuration
179
+ model: Optional[Model] = None
180
+ evaluator_agent: Optional[Agent] = None
181
+
182
+ # Output options
183
+ print_summary: bool = False
184
+ print_results: bool = False
185
+ file_path_to_save_results: Optional[str] = None
186
+ debug_mode: bool = getenv("AGNO_DEBUG", "false").lower() == "true"
187
+ db: Optional[Union[BaseDb, AsyncBaseDb]] = None
188
+ telemetry: bool = True
189
+ run_in_background: bool = False
190
+
191
+ def __post_init__(self):
192
+ """Validate scoring_strategy and threshold."""
193
+ if self.scoring_strategy == "numeric" and not 1 <= self.threshold <= 10:
194
+ raise ValueError(f"threshold must be between 1 and 10, got {self.threshold}")
195
+
196
+ def get_evaluator_agent(self) -> Agent:
197
+ """Return the evaluator agent. If not provided, build it based on the model and criteria."""
198
+ # Select response schema based on scoring strategy
199
+ response_schema = NumericJudgeResponse if self.scoring_strategy == "numeric" else BinaryJudgeResponse
200
+
201
+ if self.evaluator_agent is not None:
202
+ # Ensure custom evaluator has the required output_schema for structured responses
203
+ self.evaluator_agent.output_schema = response_schema
204
+ return self.evaluator_agent
205
+
206
+ model = self.model
207
+ if model is None:
208
+ try:
209
+ from agno.models.openai import OpenAIChat
210
+
211
+ model = OpenAIChat(id="gpt-5-mini")
212
+ except (ModuleNotFoundError, ImportError) as e:
213
+ logger.exception(e)
214
+ raise EvalError(
215
+ "Agno uses `openai` as the default model provider. Please run `pip install openai` to use the default evaluator."
216
+ )
217
+
218
+ # Build instructions based on scoring strategy
219
+ instructions_parts = ["## Criteria", self.criteria, ""]
220
+
221
+ if self.scoring_strategy == "numeric":
222
+ instructions_parts.extend(
223
+ [
224
+ "## Scoring (1-10)",
225
+ "- 1-2: Completely fails the criteria",
226
+ "- 3-4: Major issues",
227
+ "- 5-6: Partial success with significant issues",
228
+ "- 7-8: Mostly meets criteria with minor issues",
229
+ "- 9-10: Fully meets or exceeds criteria",
230
+ "",
231
+ "## Instructions",
232
+ "1. Carefully evaluate the output against the criteria above",
233
+ "2. Provide a score from 1-10",
234
+ "3. Provide detailed reasoning that references specific parts of the output",
235
+ ]
236
+ )
237
+ else: # binary
238
+ instructions_parts.extend(
239
+ [
240
+ "## Evaluation",
241
+ "Determine if the output PASSES or FAILS the criteria above.",
242
+ "",
243
+ "## Instructions",
244
+ "1. Carefully evaluate the output against the criteria above",
245
+ "2. Decide if it passes (true) or fails (false)",
246
+ "3. Provide detailed reasoning that references specific parts of the output",
247
+ ]
248
+ )
249
+
250
+ # Add additional guidelines if provided
251
+ if self.additional_guidelines:
252
+ instructions_parts.append("")
253
+ instructions_parts.append("## Additional Guidelines")
254
+ if isinstance(self.additional_guidelines, str):
255
+ instructions_parts.append(self.additional_guidelines)
256
+ else:
257
+ for guideline in self.additional_guidelines:
258
+ instructions_parts.append(f"- {guideline}")
259
+
260
+ # Add closing instruction
261
+ instructions_parts.append("")
262
+ instructions_parts.append("Be objective and thorough in your evaluation.")
263
+
264
+ return Agent(
265
+ model=model,
266
+ description="You are an expert evaluator. Score outputs objectively based on the provided criteria.",
267
+ instructions="\n".join(instructions_parts),
268
+ output_schema=response_schema,
269
+ )
270
+
271
+ def _evaluate(self, input: str, output: str, evaluator_agent: Agent) -> Optional[AgentAsJudgeEvaluation]:
272
+ """Evaluate a single input/output pair."""
273
+ try:
274
+ prompt = dedent(f"""\
275
+ <input>
276
+ {input}
277
+ </input>
278
+
279
+ <output>
280
+ {output}
281
+ </output>
282
+ """)
283
+
284
+ response = evaluator_agent.run(prompt).content
285
+ if not isinstance(response, (NumericJudgeResponse, BinaryJudgeResponse)):
286
+ raise EvalError(f"Invalid response: {response}")
287
+
288
+ # Determine pass/fail based on scoring strategy and response type
289
+ if isinstance(response, NumericJudgeResponse):
290
+ score = response.score
291
+ passed = score >= self.threshold
292
+ else: # BinaryJudgeResponse
293
+ score = None
294
+ passed = response.passed
295
+
296
+ evaluation = AgentAsJudgeEvaluation(
297
+ input=input,
298
+ output=output,
299
+ criteria=self.criteria,
300
+ score=score,
301
+ reason=response.reason,
302
+ passed=passed,
303
+ )
304
+
305
+ # Trigger on_fail callback if evaluation failed
306
+ if not passed and self.on_fail:
307
+ try:
308
+ if iscoroutinefunction(self.on_fail):
309
+ log_warning(
310
+ f"Cannot use async on_fail callback with sync evaluation. Use arun() instead. Skipping callback: {self.on_fail.__name__}"
311
+ )
312
+ else:
313
+ self.on_fail(evaluation)
314
+ except Exception as e:
315
+ logger.warning(f"on_fail callback error: {e}")
316
+
317
+ return evaluation
318
+ except Exception as e:
319
+ logger.exception(f"Evaluation failed: {e}")
320
+ return None
321
+
322
+ async def _aevaluate(self, input: str, output: str, evaluator_agent: Agent) -> Optional[AgentAsJudgeEvaluation]:
323
+ """Evaluate a single input/output pair asynchronously."""
324
+ try:
325
+ prompt = dedent(f"""\
326
+ <input>
327
+ {input}
328
+ </input>
329
+
330
+ <output>
331
+ {output}
332
+ </output>
333
+ """)
334
+
335
+ response = await evaluator_agent.arun(prompt)
336
+ judge_response = response.content
337
+ if not isinstance(judge_response, (NumericJudgeResponse, BinaryJudgeResponse)):
338
+ raise EvalError(f"Invalid response: {judge_response}")
339
+
340
+ # Determine pass/fail based on response type
341
+ if isinstance(judge_response, NumericJudgeResponse):
342
+ score = judge_response.score
343
+ passed = score >= self.threshold
344
+ else: # BinaryJudgeResponse
345
+ score = None
346
+ passed = judge_response.passed
347
+
348
+ evaluation = AgentAsJudgeEvaluation(
349
+ input=input,
350
+ output=output,
351
+ criteria=self.criteria,
352
+ score=score,
353
+ reason=judge_response.reason,
354
+ passed=passed,
355
+ )
356
+
357
+ # Trigger on_fail callback if evaluation failed
358
+ if not passed and self.on_fail:
359
+ try:
360
+ if iscoroutinefunction(self.on_fail):
361
+ await self.on_fail(evaluation)
362
+ else:
363
+ self.on_fail(evaluation)
364
+ except Exception as e:
365
+ logger.warning(f"on_fail callback error: {e}")
366
+
367
+ return evaluation
368
+ except Exception as e:
369
+ logger.exception(f"Async evaluation failed: {e}")
370
+ return None
371
+
372
+ def _log_eval_to_db(
373
+ self,
374
+ run_id: str,
375
+ result: AgentAsJudgeResult,
376
+ agent_id: Optional[str] = None,
377
+ model_id: Optional[str] = None,
378
+ model_provider: Optional[str] = None,
379
+ team_id: Optional[str] = None,
380
+ evaluated_component_name: Optional[str] = None,
381
+ ) -> None:
382
+ """Helper to log evaluation to database."""
383
+ if not self.db:
384
+ return
385
+
386
+ log_eval_run(
387
+ db=self.db, # type: ignore
388
+ run_id=run_id,
389
+ run_data=asdict(result),
390
+ eval_type=EvalType.AGENT_AS_JUDGE,
391
+ agent_id=agent_id,
392
+ model_id=model_id,
393
+ model_provider=model_provider,
394
+ name=self.name,
395
+ team_id=team_id,
396
+ evaluated_component_name=evaluated_component_name,
397
+ eval_input={
398
+ "criteria": self.criteria,
399
+ "scoring_strategy": self.scoring_strategy,
400
+ "threshold": self.threshold if self.scoring_strategy == "numeric" else None,
401
+ "additional_guidelines": self.additional_guidelines,
402
+ },
403
+ )
404
+
405
+ async def _async_log_eval_to_db(
406
+ self,
407
+ run_id: str,
408
+ result: AgentAsJudgeResult,
409
+ agent_id: Optional[str] = None,
410
+ model_id: Optional[str] = None,
411
+ model_provider: Optional[str] = None,
412
+ team_id: Optional[str] = None,
413
+ evaluated_component_name: Optional[str] = None,
414
+ ) -> None:
415
+ """Helper to log evaluation to database asynchronously."""
416
+ if not self.db:
417
+ return
418
+
419
+ await async_log_eval(
420
+ db=self.db,
421
+ run_id=run_id,
422
+ run_data=asdict(result),
423
+ eval_type=EvalType.AGENT_AS_JUDGE,
424
+ agent_id=agent_id,
425
+ model_id=model_id,
426
+ model_provider=model_provider,
427
+ name=self.name,
428
+ team_id=team_id,
429
+ evaluated_component_name=evaluated_component_name,
430
+ eval_input={
431
+ "criteria": self.criteria,
432
+ "scoring_strategy": self.scoring_strategy,
433
+ "threshold": self.threshold if self.scoring_strategy == "numeric" else None,
434
+ "additional_guidelines": self.additional_guidelines,
435
+ },
436
+ )
437
+
438
+ def run(
439
+ self,
440
+ *,
441
+ input: Optional[str] = None,
442
+ output: Optional[str] = None,
443
+ cases: Optional[List[Dict[str, str]]] = None,
444
+ print_summary: bool = False,
445
+ print_results: bool = False,
446
+ ) -> Optional[AgentAsJudgeResult]:
447
+ """Evaluate input/output against the criteria.
448
+
449
+ Supports both single evaluation and batch evaluation:
450
+
451
+ Args:
452
+ input: Input text for single evaluation
453
+ output: Output text for single evaluation
454
+ cases: List of input/output pairs for batch evaluation
455
+ print_summary: Whether to print summary
456
+ print_results: Whether to print detailed results
457
+ """
458
+ # Generate unique run_id for this execution
459
+ run_id = str(uuid4())
460
+
461
+ # Validate parameters
462
+ single_mode = input is not None or output is not None
463
+ batch_mode = cases is not None
464
+
465
+ if single_mode and batch_mode:
466
+ raise ValueError("Provide either (input, output) OR cases, not both")
467
+
468
+ if not single_mode and not batch_mode:
469
+ raise ValueError("Must provide either (input, output) OR cases")
470
+
471
+ # Batch mode if cases provided
472
+ if batch_mode and cases is not None:
473
+ return self._run_batch(cases=cases, run_id=run_id, print_summary=print_summary, print_results=print_results)
474
+
475
+ # Validate single mode has both input and output
476
+ if input is None or output is None:
477
+ raise ValueError("Both input and output are required for single evaluation")
478
+
479
+ # Single evaluation logic
480
+ from rich.console import Console
481
+ from rich.live import Live
482
+ from rich.status import Status
483
+
484
+ if isinstance(self.db, AsyncBaseDb):
485
+ raise ValueError("Use arun() with async DB.")
486
+
487
+ set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
488
+ result = AgentAsJudgeResult(run_id=run_id)
489
+
490
+ console = Console()
491
+ with Live(console=console, transient=True) as live_log:
492
+ evaluator = self.get_evaluator_agent()
493
+
494
+ status = Status("Running evaluation...", spinner="dots", speed=1.0, refresh_per_second=10)
495
+ live_log.update(status)
496
+
497
+ evaluation = self._evaluate(input=input, output=output, evaluator_agent=evaluator)
498
+
499
+ if evaluation:
500
+ result.results.append(evaluation)
501
+ result.compute_stats()
502
+
503
+ status.stop()
504
+
505
+ # Save result to file
506
+ if self.file_path_to_save_results:
507
+ store_result_in_file(
508
+ file_path=self.file_path_to_save_results,
509
+ result=result,
510
+ eval_id=run_id,
511
+ name=self.name,
512
+ )
513
+
514
+ # Print results
515
+ if self.print_results or print_results:
516
+ result.print_results(console)
517
+ if self.print_summary or print_summary:
518
+ result.print_summary(console)
519
+
520
+ # Log to DB
521
+ self._log_eval_to_db(run_id=run_id, result=result)
522
+
523
+ if self.telemetry:
524
+ from agno.api.evals import EvalRunCreate, create_eval_run_telemetry
525
+
526
+ create_eval_run_telemetry(
527
+ eval_run=EvalRunCreate(
528
+ run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
529
+ )
530
+ )
531
+
532
+ return result
533
+
534
+ async def arun(
535
+ self,
536
+ *,
537
+ input: Optional[str] = None,
538
+ output: Optional[str] = None,
539
+ cases: Optional[List[Dict[str, str]]] = None,
540
+ print_summary: bool = False,
541
+ print_results: bool = False,
542
+ ) -> Optional[AgentAsJudgeResult]:
543
+ """Evaluate input/output against the criteria asynchronously.
544
+
545
+ Supports both single evaluation and batch evaluation:
546
+
547
+ Args:
548
+ input: Input text for single evaluation
549
+ output: Output text for single evaluation
550
+ cases: List of input/output pairs for batch evaluation
551
+ print_summary: Whether to print summary
552
+ print_results: Whether to print detailed results
553
+ """
554
+ # Generate unique run_id for this execution
555
+ run_id = str(uuid4())
556
+
557
+ # Validate parameters
558
+ single_mode = input is not None or output is not None
559
+ batch_mode = cases is not None
560
+
561
+ if single_mode and batch_mode:
562
+ raise ValueError("Provide either (input, output) OR cases, not both")
563
+
564
+ if not single_mode and not batch_mode:
565
+ raise ValueError("Must provide either (input, output) OR cases")
566
+
567
+ # Batch mode if cases provided
568
+ if batch_mode and cases is not None:
569
+ return await self._arun_batch(
570
+ cases=cases, run_id=run_id, print_summary=print_summary, print_results=print_results
571
+ )
572
+
573
+ # Validate single mode has both input and output
574
+ if input is None or output is None:
575
+ raise ValueError("Both input and output are required for single evaluation")
576
+
577
+ # Single evaluation logic
578
+ from rich.console import Console
579
+ from rich.live import Live
580
+ from rich.status import Status
581
+
582
+ set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
583
+ result = AgentAsJudgeResult(run_id=run_id)
584
+
585
+ console = Console()
586
+ with Live(console=console, transient=True) as live_log:
587
+ evaluator = self.get_evaluator_agent()
588
+
589
+ status = Status("Running evaluation...", spinner="dots", speed=1.0, refresh_per_second=10)
590
+ live_log.update(status)
591
+
592
+ evaluation = await self._aevaluate(input=input, output=output, evaluator_agent=evaluator)
593
+
594
+ if evaluation:
595
+ result.results.append(evaluation)
596
+ result.compute_stats()
597
+
598
+ status.stop()
599
+
600
+ # Save result to file
601
+ if self.file_path_to_save_results:
602
+ store_result_in_file(
603
+ file_path=self.file_path_to_save_results,
604
+ result=result,
605
+ eval_id=run_id,
606
+ name=self.name,
607
+ )
608
+
609
+ # Print results
610
+ if self.print_results or print_results:
611
+ result.print_results(console)
612
+ if self.print_summary or print_summary:
613
+ result.print_summary(console)
614
+
615
+ # Log to DB
616
+ await self._async_log_eval_to_db(run_id=run_id, result=result)
617
+
618
+ if self.telemetry:
619
+ from agno.api.evals import EvalRunCreate, async_create_eval_run_telemetry
620
+
621
+ await async_create_eval_run_telemetry(
622
+ eval_run=EvalRunCreate(
623
+ run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
624
+ )
625
+ )
626
+
627
+ return result
628
+
629
+ def _run_batch(
630
+ self,
631
+ cases: List[Dict[str, str]],
632
+ run_id: str,
633
+ *,
634
+ print_summary: bool = True,
635
+ print_results: bool = False,
636
+ ) -> Optional[AgentAsJudgeResult]:
637
+ """Private helper: Evaluate multiple input/output pairs.
638
+
639
+ Args:
640
+ cases: List of dicts with 'input' and 'output' keys
641
+ run_id: Unique ID for this evaluation run
642
+ """
643
+ from rich.console import Console
644
+ from rich.live import Live
645
+ from rich.status import Status
646
+
647
+ if isinstance(self.db, AsyncBaseDb):
648
+ raise ValueError("Use arun() with async DB.")
649
+
650
+ set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
651
+ result = AgentAsJudgeResult(run_id=run_id)
652
+
653
+ console = Console()
654
+ with Live(console=console, transient=True) as live_log:
655
+ evaluator = self.get_evaluator_agent()
656
+
657
+ for i, case in enumerate(cases):
658
+ status = Status(f"Evaluating {i + 1}/{len(cases)}...", spinner="dots")
659
+ live_log.update(status)
660
+
661
+ evaluation = self._evaluate(input=case["input"], output=case["output"], evaluator_agent=evaluator)
662
+ if evaluation:
663
+ result.results.append(evaluation)
664
+ result.compute_stats()
665
+
666
+ status.stop()
667
+
668
+ # Save result to file
669
+ if self.file_path_to_save_results:
670
+ store_result_in_file(
671
+ file_path=self.file_path_to_save_results,
672
+ result=result,
673
+ eval_id=run_id,
674
+ name=self.name,
675
+ )
676
+
677
+ # Print results
678
+ if self.print_results or print_results:
679
+ result.print_results(console)
680
+ if self.print_summary or print_summary:
681
+ result.print_summary(console)
682
+
683
+ # Log to DB
684
+ self._log_eval_to_db(run_id=run_id, result=result)
685
+
686
+ if self.telemetry:
687
+ from agno.api.evals import EvalRunCreate, create_eval_run_telemetry
688
+
689
+ create_eval_run_telemetry(
690
+ eval_run=EvalRunCreate(
691
+ run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
692
+ )
693
+ )
694
+
695
+ return result
696
+
697
+ async def _arun_batch(
698
+ self,
699
+ cases: List[Dict[str, str]],
700
+ run_id: str,
701
+ *,
702
+ print_summary: bool = True,
703
+ print_results: bool = False,
704
+ ) -> Optional[AgentAsJudgeResult]:
705
+ """Private helper: Evaluate multiple input/output pairs asynchronously.
706
+
707
+ Args:
708
+ cases: List of dicts with 'input' and 'output' keys
709
+ run_id: Unique ID for this evaluation run
710
+ """
711
+ from rich.console import Console
712
+ from rich.live import Live
713
+ from rich.status import Status
714
+
715
+ set_log_level_to_debug() if self.debug_mode else set_log_level_to_info()
716
+ result = AgentAsJudgeResult(run_id=run_id)
717
+
718
+ console = Console()
719
+ with Live(console=console, transient=True) as live_log:
720
+ evaluator = self.get_evaluator_agent()
721
+
722
+ for i, case in enumerate(cases):
723
+ status = Status(f"Evaluating {i + 1}/{len(cases)}...", spinner="dots")
724
+ live_log.update(status)
725
+
726
+ evaluation = await self._aevaluate(
727
+ input=case["input"], output=case["output"], evaluator_agent=evaluator
728
+ )
729
+ if evaluation:
730
+ result.results.append(evaluation)
731
+ result.compute_stats()
732
+
733
+ status.stop()
734
+
735
+ # Save result to file
736
+ if self.file_path_to_save_results:
737
+ store_result_in_file(
738
+ file_path=self.file_path_to_save_results,
739
+ result=result,
740
+ eval_id=run_id,
741
+ name=self.name,
742
+ )
743
+
744
+ # Print results
745
+ if self.print_results or print_results:
746
+ result.print_results(console)
747
+ if self.print_summary or print_summary:
748
+ result.print_summary(console)
749
+
750
+ # Log to DB
751
+ await self._async_log_eval_to_db(run_id=run_id, result=result)
752
+
753
+ if self.telemetry:
754
+ from agno.api.evals import EvalRunCreate, async_create_eval_run_telemetry
755
+
756
+ await async_create_eval_run_telemetry(
757
+ eval_run=EvalRunCreate(
758
+ run_id=run_id, eval_type=EvalType.AGENT_AS_JUDGE, data=self._get_telemetry_data(result)
759
+ )
760
+ )
761
+
762
+ return result
763
+
764
+ def _get_telemetry_data(self, result: Optional[AgentAsJudgeResult] = None) -> Dict[str, Any]:
765
+ return {
766
+ "criteria_length": len(self.criteria) if self.criteria else 0,
767
+ "scoring_strategy": self.scoring_strategy,
768
+ "threshold": self.threshold if self.scoring_strategy == "numeric" else None,
769
+ "num_results": len(result.results) if result else 0,
770
+ }
771
+
772
+ # BaseEval hook methods
773
+ def pre_check(self, run_input: Union[RunInput, TeamRunInput]) -> None:
774
+ raise ValueError("Pre-hooks are not supported")
775
+
776
+ async def async_pre_check(self, run_input: Union[RunInput, TeamRunInput]) -> None:
777
+ raise ValueError("Pre-hooks are not supported")
778
+
779
+ def post_check(self, run_output: Union[RunOutput, TeamRunOutput]) -> None:
780
+ """Perform sync post-check to evaluate agent output."""
781
+ input_str = run_output.input.input_content_string() if run_output.input else ""
782
+ output_str = str(run_output.content) if run_output.content else ""
783
+
784
+ # Temporarily disable DB logging
785
+ original_db = self.db
786
+ self.db = None
787
+
788
+ # Run evaluation and capture result
789
+ result = self.run(
790
+ input=input_str, output=output_str, print_results=self.print_results, print_summary=self.print_summary
791
+ )
792
+
793
+ # Restore DB and log with context from run_output
794
+ self.db = original_db
795
+
796
+ if isinstance(self.db, AsyncBaseDb):
797
+ raise ValueError("post_check() requires sync DB. Use async_post_check() with async DB.")
798
+
799
+ # Extract metadata from run_output
800
+ if isinstance(run_output, RunOutput):
801
+ agent_id = run_output.agent_id
802
+ team_id = None
803
+ model_id = run_output.model
804
+ model_provider = run_output.model_provider
805
+ elif isinstance(run_output, TeamRunOutput):
806
+ agent_id = None
807
+ team_id = run_output.team_id
808
+ model_id = run_output.model
809
+ model_provider = run_output.model_provider
810
+
811
+ # Log to DB if we have a valid result (use run_id from result)
812
+ if result:
813
+ self._log_eval_to_db(
814
+ run_id=result.run_id,
815
+ result=result,
816
+ agent_id=agent_id,
817
+ model_id=model_id,
818
+ model_provider=model_provider,
819
+ team_id=team_id,
820
+ )
821
+
822
+ async def async_post_check(self, run_output: Union[RunOutput, TeamRunOutput]) -> None:
823
+ """Perform async post-check to evaluate agent output."""
824
+ input_str = run_output.input.input_content_string() if run_output.input else ""
825
+ output_str = str(run_output.content) if run_output.content else ""
826
+
827
+ # Temporarily disable DB logging
828
+ original_db = self.db
829
+ self.db = None
830
+
831
+ # Run evaluation and capture result
832
+ result = await self.arun(
833
+ input=input_str, output=output_str, print_results=self.print_results, print_summary=self.print_summary
834
+ )
835
+
836
+ # Restore DB and log with context from run_output
837
+ self.db = original_db
838
+
839
+ # Extract metadata from run_output
840
+ if isinstance(run_output, RunOutput):
841
+ agent_id = run_output.agent_id
842
+ team_id = None
843
+ model_id = run_output.model
844
+ model_provider = run_output.model_provider
845
+ elif isinstance(run_output, TeamRunOutput):
846
+ agent_id = None
847
+ team_id = run_output.team_id
848
+ model_id = run_output.model
849
+ model_provider = run_output.model_provider
850
+
851
+ # Log to DB if we have a valid result (use run_id from result)
852
+ if result:
853
+ await self._async_log_eval_to_db(
854
+ run_id=result.run_id,
855
+ result=result,
856
+ agent_id=agent_id,
857
+ model_id=model_id,
858
+ model_provider=model_provider,
859
+ team_id=team_id,
860
+ )
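
To make the new agno/eval/agent_as_judge.py module above easier to follow, here is a minimal usage sketch based only on the API visible in this diff. The eval name, criteria text, and example strings are illustrative placeholders, and an OpenAI API key is assumed because the class falls back to OpenAIChat(id="gpt-5-mini") when no model is supplied.

from agno.eval.agent_as_judge import AgentAsJudgeEval

judge = AgentAsJudgeEval(
    name="helpfulness",                 # hypothetical eval name
    criteria="The response directly answers the user's question in a polite tone.",
    scoring_strategy="numeric",         # ask the judge for a 1-10 score
    threshold=7,                        # numeric mode passes when score >= threshold
    telemetry=False,                    # skip the telemetry call in this sketch
)

# Single evaluation: one input/output pair, keyword-only arguments.
result = judge.run(
    input="What time do you close today?",
    output="We close at 6 pm today. Thanks for asking!",
    print_results=True,
)
if result and result.results:
    evaluation = result.results[0]
    print(evaluation.score, evaluation.passed, evaluation.reason)

In numeric mode the judge returns a 1-10 score and the evaluation passes when the score is at least the threshold; in the default binary mode it returns only a pass/fail flag, so score and the aggregate avg_score stay None.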
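
A second sketch, under the same assumptions, shows batch evaluation in the default binary mode together with the optional model and additional_guidelines fields defined above. The judge model id, guidelines, and cases are placeholders.

from agno.eval.agent_as_judge import AgentAsJudgeEval
from agno.models.openai import OpenAIChat

judge = AgentAsJudgeEval(
    criteria="The answer is factually consistent with the provided input.",
    scoring_strategy="binary",           # default: pass/fail, no numeric score
    additional_guidelines=[
        "Treat unsupported claims as a failure.",
        "Ignore stylistic issues.",
    ],
    model=OpenAIChat(id="gpt-4o-mini"),  # placeholder judge model
    telemetry=False,
)

result = judge.run(
    cases=[
        {"input": "Paris is the capital of France.", "output": "The capital is Paris."},
        {"input": "Water boils at 100C at sea level.", "output": "Water boils at 50C."},
    ],
    print_summary=True,
    print_results=True,
)
if result:
    # Binary mode reports a pass rate; avg_score stays None because no scores exist.
    print(f"pass rate: {result.pass_rate:.0f}%")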
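
A final sketch covers the async path. Per the _evaluate and _aevaluate methods above, an async on_fail callback is awaited only by arun(); the sync run() logs a warning and skips it. The callback body and example strings are placeholders.

import asyncio

from agno.eval.agent_as_judge import AgentAsJudgeEval, AgentAsJudgeEvaluation


async def alert_on_failure(evaluation: AgentAsJudgeEvaluation) -> None:
    # Placeholder side effect; a real callback might page someone or write a log entry.
    print(f"Judge failed the run: {evaluation.reason}")


judge = AgentAsJudgeEval(
    criteria="The output must not reveal personally identifiable information.",
    scoring_strategy="binary",
    on_fail=alert_on_failure,   # async callbacks are awaited only via arun()
    telemetry=False,
)

result = asyncio.run(
    judge.arun(
        input="Summarize the support ticket.",
        output="Customer Jane Doe, SSN 123-45-6789, reported a billing issue.",
        print_results=True,
    )
)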