agno 2.3.8__py3-none-any.whl → 2.3.10__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- agno/agent/agent.py +134 -94
- agno/db/mysql/__init__.py +2 -1
- agno/db/mysql/async_mysql.py +2888 -0
- agno/db/mysql/mysql.py +17 -8
- agno/db/mysql/utils.py +139 -6
- agno/db/postgres/async_postgres.py +10 -5
- agno/db/postgres/postgres.py +7 -2
- agno/db/schemas/evals.py +1 -0
- agno/db/singlestore/singlestore.py +5 -1
- agno/db/sqlite/async_sqlite.py +3 -3
- agno/eval/__init__.py +10 -0
- agno/eval/accuracy.py +11 -8
- agno/eval/agent_as_judge.py +861 -0
- agno/eval/base.py +29 -0
- agno/eval/utils.py +2 -1
- agno/exceptions.py +7 -0
- agno/knowledge/embedder/openai.py +8 -8
- agno/knowledge/knowledge.py +1142 -176
- agno/media.py +22 -6
- agno/models/aws/claude.py +8 -7
- agno/models/base.py +61 -2
- agno/models/deepseek/deepseek.py +67 -0
- agno/models/google/gemini.py +134 -51
- agno/models/google/utils.py +22 -0
- agno/models/message.py +5 -0
- agno/models/openai/chat.py +4 -0
- agno/os/app.py +64 -74
- agno/os/interfaces/a2a/router.py +3 -4
- agno/os/interfaces/agui/router.py +2 -0
- agno/os/router.py +3 -1607
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +581 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/evals.py +26 -6
- agno/os/routers/evals/schemas.py +34 -2
- agno/os/routers/evals/utils.py +77 -18
- agno/os/routers/knowledge/knowledge.py +1 -1
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +496 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +545 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +1 -559
- agno/os/utils.py +139 -2
- agno/team/team.py +87 -24
- agno/tools/file_generation.py +12 -6
- agno/tools/firecrawl.py +15 -7
- agno/tools/function.py +37 -23
- agno/tools/shopify.py +1519 -0
- agno/tools/spotify.py +2 -5
- agno/utils/hooks.py +64 -5
- agno/utils/http.py +2 -2
- agno/utils/media.py +11 -1
- agno/utils/print_response/agent.py +8 -0
- agno/utils/print_response/team.py +8 -0
- agno/vectordb/pgvector/pgvector.py +88 -51
- agno/workflow/parallel.py +5 -3
- agno/workflow/step.py +14 -2
- agno/workflow/types.py +38 -2
- agno/workflow/workflow.py +12 -4
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/METADATA +7 -2
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/RECORD +66 -52
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/WHEEL +0 -0
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/licenses/LICENSE +0 -0
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/top_level.txt +0 -0

agno/os/routers/agents/schema.py
ADDED

@@ -0,0 +1,261 @@
+from typing import Any, Dict, Optional
+from uuid import uuid4
+
+from pydantic import BaseModel
+
+from agno.agent import Agent
+from agno.models.message import Message
+from agno.os.schema import ModelResponse
+from agno.os.utils import (
+    format_tools,
+    get_agent_input_schema_dict,
+)
+from agno.run import RunContext
+from agno.run.agent import RunOutput
+from agno.session import AgentSession
+from agno.utils.agent import aexecute_instructions, aexecute_system_message
+
+
+class AgentResponse(BaseModel):
+    id: Optional[str] = None
+    name: Optional[str] = None
+    db_id: Optional[str] = None
+    model: Optional[ModelResponse] = None
+    tools: Optional[Dict[str, Any]] = None
+    sessions: Optional[Dict[str, Any]] = None
+    knowledge: Optional[Dict[str, Any]] = None
+    memory: Optional[Dict[str, Any]] = None
+    reasoning: Optional[Dict[str, Any]] = None
+    default_tools: Optional[Dict[str, Any]] = None
+    system_message: Optional[Dict[str, Any]] = None
+    extra_messages: Optional[Dict[str, Any]] = None
+    response_settings: Optional[Dict[str, Any]] = None
+    introduction: Optional[str] = None
+    streaming: Optional[Dict[str, Any]] = None
+    metadata: Optional[Dict[str, Any]] = None
+    input_schema: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    async def from_agent(cls, agent: Agent) -> "AgentResponse":
+        def filter_meaningful_config(d: Dict[str, Any], defaults: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+            """Filter out fields that match their default values, keeping only meaningful user configurations"""
+            filtered = {}
+            for key, value in d.items():
+                if value is None:
+                    continue
+                # Skip if value matches the default exactly
+                if key in defaults and value == defaults[key]:
+                    continue
+                # Keep non-default values
+                filtered[key] = value
+            return filtered if filtered else None
+
+        # Define default values for filtering
+        agent_defaults = {
+            # Sessions defaults
+            "add_history_to_context": False,
+            "num_history_runs": 3,
+            "enable_session_summaries": False,
+            "search_session_history": False,
+            "cache_session": False,
+            # Knowledge defaults
+            "add_references": False,
+            "references_format": "json",
+            "enable_agentic_knowledge_filters": False,
+            # Memory defaults
+            "enable_agentic_memory": False,
+            "enable_user_memories": False,
+            # Reasoning defaults
+            "reasoning": False,
+            "reasoning_min_steps": 1,
+            "reasoning_max_steps": 10,
+            # Default tools defaults
+            "read_chat_history": False,
+            "search_knowledge": True,
+            "update_knowledge": False,
+            "read_tool_call_history": False,
+            # System message defaults
+            "system_message_role": "system",
+            "build_context": True,
+            "markdown": False,
+            "add_name_to_context": False,
+            "add_datetime_to_context": False,
+            "add_location_to_context": False,
+            "resolve_in_context": True,
+            # Extra messages defaults
+            "user_message_role": "user",
+            "build_user_context": True,
+            # Response settings defaults
+            "retries": 0,
+            "delay_between_retries": 1,
+            "exponential_backoff": False,
+            "parse_response": True,
+            "use_json_mode": False,
+            # Streaming defaults
+            "stream_events": False,
+            "stream_intermediate_steps": False,
+        }
+
+        session_id = str(uuid4())
+        run_id = str(uuid4())
+        agent_tools = await agent.aget_tools(
+            session=AgentSession(session_id=session_id, session_data={}),
+            run_response=RunOutput(run_id=run_id, session_id=session_id),
+            run_context=RunContext(run_id=run_id, session_id=session_id, user_id=agent.user_id),
+            check_mcp_tools=False,
+        )
+        formatted_tools = format_tools(agent_tools) if agent_tools else None
+
+        additional_input = agent.additional_input
+        if additional_input and isinstance(additional_input[0], Message):
+            additional_input = [message.to_dict() for message in additional_input]  # type: ignore
+
+        # Build model only if it has at least one non-null field
+        model_name = agent.model.name if (agent.model and agent.model.name) else None
+        model_provider = agent.model.provider if (agent.model and agent.model.provider) else None
+        model_id = agent.model.id if (agent.model and agent.model.id) else None
+        _agent_model_data: Dict[str, Any] = {}
+        if model_name is not None:
+            _agent_model_data["name"] = model_name
+        if model_id is not None:
+            _agent_model_data["model"] = model_id
+        if model_provider is not None:
+            _agent_model_data["provider"] = model_provider
+
+        session_table = agent.db.session_table_name if agent.db else None
+        knowledge_table = agent.db.knowledge_table_name if agent.db and agent.knowledge else None
+
+        tools_info = {
+            "tools": formatted_tools,
+            "tool_call_limit": agent.tool_call_limit,
+            "tool_choice": agent.tool_choice,
+        }
+
+        sessions_info = {
+            "session_table": session_table,
+            "add_history_to_context": agent.add_history_to_context,
+            "enable_session_summaries": agent.enable_session_summaries,
+            "num_history_runs": agent.num_history_runs,
+            "search_session_history": agent.search_session_history,
+            "num_history_sessions": agent.num_history_sessions,
+            "cache_session": agent.cache_session,
+        }
+
+        knowledge_info = {
+            "knowledge_table": knowledge_table,
+            "enable_agentic_knowledge_filters": agent.enable_agentic_knowledge_filters,
+            "knowledge_filters": agent.knowledge_filters,
+            "references_format": agent.references_format,
+        }
+
+        memory_info: Optional[Dict[str, Any]] = None
+        if agent.memory_manager is not None:
+            memory_info = {
+                "enable_agentic_memory": agent.enable_agentic_memory,
+                "enable_user_memories": agent.enable_user_memories,
+                "metadata": agent.metadata,
+                "memory_table": agent.db.memory_table_name if agent.db and agent.enable_user_memories else None,
+            }
+
+            if agent.memory_manager.model is not None:
+                memory_info["model"] = ModelResponse(
+                    name=agent.memory_manager.model.name,
+                    model=agent.memory_manager.model.id,
+                    provider=agent.memory_manager.model.provider,
+                ).model_dump()
+
+        reasoning_info: Dict[str, Any] = {
+            "reasoning": agent.reasoning,
+            "reasoning_agent_id": agent.reasoning_agent.id if agent.reasoning_agent else None,
+            "reasoning_min_steps": agent.reasoning_min_steps,
+            "reasoning_max_steps": agent.reasoning_max_steps,
+        }
+
+        if agent.reasoning_model:
+            reasoning_info["reasoning_model"] = ModelResponse(
+                name=agent.reasoning_model.name,
+                model=agent.reasoning_model.id,
+                provider=agent.reasoning_model.provider,
+            ).model_dump()
+
+        default_tools_info = {
+            "read_chat_history": agent.read_chat_history,
+            "search_knowledge": agent.search_knowledge,
+            "update_knowledge": agent.update_knowledge,
+            "read_tool_call_history": agent.read_tool_call_history,
+        }
+
+        instructions = agent.instructions if agent.instructions else None
+        if instructions and callable(instructions):
+            instructions = await aexecute_instructions(instructions=instructions, agent=agent)
+
+        system_message = agent.system_message if agent.system_message else None
+        if system_message and callable(system_message):
+            system_message = await aexecute_system_message(system_message=system_message, agent=agent)
+
+        system_message_info = {
+            "system_message": str(system_message) if system_message else None,
+            "system_message_role": agent.system_message_role,
+            "build_context": agent.build_context,
+            "description": agent.description,
+            "instructions": instructions,
+            "expected_output": agent.expected_output,
+            "additional_context": agent.additional_context,
+            "markdown": agent.markdown,
+            "add_name_to_context": agent.add_name_to_context,
+            "add_datetime_to_context": agent.add_datetime_to_context,
+            "add_location_to_context": agent.add_location_to_context,
+            "timezone_identifier": agent.timezone_identifier,
+            "resolve_in_context": agent.resolve_in_context,
+        }
+
+        extra_messages_info = {
+            "additional_input": additional_input,  # type: ignore
+            "user_message_role": agent.user_message_role,
+            "build_user_context": agent.build_user_context,
+        }
+
+        response_settings_info: Dict[str, Any] = {
+            "retries": agent.retries,
+            "delay_between_retries": agent.delay_between_retries,
+            "exponential_backoff": agent.exponential_backoff,
+            "output_schema_name": agent.output_schema.__name__ if agent.output_schema else None,
+            "parser_model_prompt": agent.parser_model_prompt,
+            "parse_response": agent.parse_response,
+            "structured_outputs": agent.structured_outputs,
+            "use_json_mode": agent.use_json_mode,
+            "save_response_to_file": agent.save_response_to_file,
+        }
+
+        if agent.parser_model:
+            response_settings_info["parser_model"] = ModelResponse(
+                name=agent.parser_model.name,
+                model=agent.parser_model.id,
+                provider=agent.parser_model.provider,
+            ).model_dump()
+
+        streaming_info = {
+            "stream": agent.stream,
+            "stream_events": agent.stream_events,
+            "stream_intermediate_steps": agent.stream_intermediate_steps,
+        }
+
+        return AgentResponse(
+            id=agent.id,
+            name=agent.name,
+            db_id=agent.db.id if agent.db else None,
+            model=ModelResponse(**_agent_model_data) if _agent_model_data else None,
+            tools=filter_meaningful_config(tools_info, {}),
+            sessions=filter_meaningful_config(sessions_info, agent_defaults),
+            knowledge=filter_meaningful_config(knowledge_info, agent_defaults),
+            memory=filter_meaningful_config(memory_info, agent_defaults) if memory_info else None,
+            reasoning=filter_meaningful_config(reasoning_info, agent_defaults),
+            default_tools=filter_meaningful_config(default_tools_info, agent_defaults),
+            system_message=filter_meaningful_config(system_message_info, agent_defaults),
+            extra_messages=filter_meaningful_config(extra_messages_info, agent_defaults),
+            response_settings=filter_meaningful_config(response_settings_info, agent_defaults),
+            streaming=filter_meaningful_config(streaming_info, agent_defaults),
+            introduction=agent.introduction,
+            metadata=agent.metadata,
+            input_schema=get_agent_input_schema_dict(agent),
+        )
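
Usage note (not part of the diff): a minimal sketch of calling the new AgentResponse.from_agent classmethod added above. Only AgentResponse, Agent, and from_agent come from this diff; the Agent constructor arguments and the JSON printing are illustrative assumptions.

import asyncio

from agno.agent import Agent
from agno.os.routers.agents.schema import AgentResponse


async def main() -> None:
    # Hypothetical agent configuration; any Agent instance should work here
    agent = Agent(name="demo-agent", markdown=True)
    # from_agent is async because it resolves tools and any callable instructions/system message
    response = await AgentResponse.from_agent(agent)
    # filter_meaningful_config drops settings that still match their defaults
    print(response.model_dump_json(exclude_none=True, indent=2))


if __name__ == "__main__":
    asyncio.run(main())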
agno/os/routers/evals/evals.py
CHANGED

@@ -15,7 +15,12 @@ from agno.os.routers.evals.schemas import (
     EvalSchema,
     UpdateEvalRunRequest,
 )
-from agno.os.routers.evals.utils import
+from agno.os.routers.evals.utils import (
+    run_accuracy_eval,
+    run_agent_as_judge_eval,
+    run_performance_eval,
+    run_reliability_eval,
+)
 from agno.os.schema import (
     BadRequestResponse,
     InternalServerErrorResponse,

@@ -119,6 +124,15 @@ def attach_routes(
     ) -> PaginatedResponse[EvalSchema]:
         db = await get_db(dbs, db_id, table)

+        # TODO: Delete me:
+        # Filtering out agent-as-judge by default for now,
+        # as they are not supported yet in the AgentOS UI.
+        eval_types = eval_types or [
+            EvalType.ACCURACY,
+            EvalType.PERFORMANCE,
+            EvalType.RELIABILITY,
+        ]
+
         if isinstance(db, AsyncBaseDb):
             db = cast(AsyncBaseDb, db)
             eval_runs, total_count = await db.get_eval_runs(

@@ -304,7 +318,7 @@ def attach_routes(
         operation_id="run_eval",
         summary="Execute Evaluation",
         description=(
-            "Run evaluation tests on agents or teams. Supports accuracy, performance, and reliability evaluations. "
+            "Run evaluation tests on agents or teams. Supports accuracy, agent-as-judge, performance, and reliability evaluations. "
            "Requires either agent_id or team_id, but not both."
         ),
         responses={

@@ -374,6 +388,7 @@ def attach_routes(
         if not team:
             raise HTTPException(status_code=404, detail=f"Team with id '{eval_run_input.team_id}' not found")

+        # If model_id/model_provider specified, override team's model temporarily
         default_model = None
         if (
             hasattr(team, "model")

@@ -381,13 +396,13 @@ def attach_routes(
             and eval_run_input.model_id is not None
             and eval_run_input.model_provider is not None
         ):
-            default_model = deepcopy(team.model)
+            default_model = deepcopy(team.model)  # Save original
             if eval_run_input.model_id != team.model.id or eval_run_input.model_provider != team.model.provider:
                 model_provider = eval_run_input.model_provider.lower()
                 model_id = eval_run_input.model_id.lower()
                 model_string = f"{model_provider}:{model_id}"
                 model = get_model(model_string)
-                team.model = model
+                team.model = model  # Override temporarily

         agent = None

@@ -400,6 +415,11 @@ def attach_routes(
                 eval_run_input=eval_run_input, db=db, agent=agent, team=team, default_model=default_model
             )

+        elif eval_run_input.eval_type == EvalType.AGENT_AS_JUDGE:
+            return await run_agent_as_judge_eval(
+                eval_run_input=eval_run_input, db=db, agent=agent, team=team, default_model=default_model
+            )
+
         elif eval_run_input.eval_type == EvalType.PERFORMANCE:
             return await run_performance_eval(
                 eval_run_input=eval_run_input, db=db, agent=agent, team=team, default_model=default_model

@@ -416,8 +436,8 @@ def attach_routes(
     def parse_eval_types_filter(
         eval_types: Optional[str] = Query(
             default=None,
-            description="Comma-separated eval types (accuracy,performance,reliability)",
-            examples=["accuracy,performance"],
+            description="Comma-separated eval types (accuracy,agent_as_judge,performance,reliability)",
+            examples=["accuracy,agent_as_judge,performance,reliability"],
         ),
     ) -> Optional[List[EvalType]]:
         """Parse comma-separated eval types into EvalType enums for filtering evaluation runs."""
agno/os/routers/evals/schemas.py
CHANGED

@@ -1,12 +1,13 @@
 from dataclasses import asdict
 from datetime import datetime, timezone
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional

 from pydantic import BaseModel, Field

 from agno.db.schemas.evals import EvalType
-from agno.eval import AccuracyResult, PerformanceResult, ReliabilityResult
+from agno.eval import AccuracyResult, AgentAsJudgeResult, PerformanceResult, ReliabilityResult
 from agno.eval.accuracy import AccuracyEval
+from agno.eval.agent_as_judge import AgentAsJudgeEval
 from agno.eval.performance import PerformanceEval
 from agno.eval.reliability import ReliabilityEval

@@ -27,6 +28,15 @@ class EvalRunInput(BaseModel):
     # Accuracy eval specific fields
     expected_output: Optional[str] = Field(None, description="Expected output for accuracy evaluation")

+    # AgentAsJudge eval specific fields
+    criteria: Optional[str] = Field(None, description="Evaluation criteria for agent-as-judge evaluation")
+    scoring_strategy: Optional[Literal["numeric", "binary"]] = Field(
+        "binary", description="Scoring strategy: 'numeric' (1-10 with threshold) or 'binary' (PASS/FAIL)"
+    )
+    threshold: Optional[int] = Field(
+        7, description="Score threshold for pass/fail (1-10), only used with numeric scoring", ge=1, le=10
+    )
+
     # Performance eval specific fields
     warmup_runs: int = Field(0, description="Number of warmup runs before measuring performance", ge=0, le=10)

@@ -89,6 +99,28 @@ class EvalSchema(BaseModel):
             eval_data=asdict(result),
         )

+    @classmethod
+    def from_agent_as_judge_eval(
+        cls,
+        agent_as_judge_eval: AgentAsJudgeEval,
+        result: AgentAsJudgeResult,
+        model_id: Optional[str] = None,
+        model_provider: Optional[str] = None,
+        agent_id: Optional[str] = None,
+        team_id: Optional[str] = None,
+    ) -> "EvalSchema":
+        return cls(
+            id=result.run_id,
+            name=agent_as_judge_eval.name,
+            agent_id=agent_id,
+            team_id=team_id,
+            workflow_id=None,
+            model_id=model_id,
+            model_provider=model_provider,
+            eval_type=EvalType.AGENT_AS_JUDGE,
+            eval_data=asdict(result),
+        )
+
     @classmethod
     def from_performance_eval(
         cls,
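
Usage note (not part of the diff): a sketch of the request body for the Execute Evaluation endpoint using the new agent-as-judge fields of EvalRunInput. The field names and the "agent_as_judge" value come from this diff; the endpoint path and any other required fields are not shown here, so treat the exact payload as an assumption.

payload = {
    "eval_type": "agent_as_judge",
    "agent_id": "my-agent",  # or "team_id"; the endpoint requires exactly one of the two
    "input": "Summarize the latest release notes.",
    "criteria": "The summary must be factually grounded and under 100 words.",
    "scoring_strategy": "numeric",  # or "binary" (the default)
    "threshold": 7,  # only used with numeric scoring
}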
agno/os/routers/evals/utils.py
CHANGED

@@ -5,6 +5,7 @@ from fastapi import HTTPException
 from agno.agent.agent import Agent
 from agno.db.base import AsyncBaseDb, BaseDb
 from agno.eval.accuracy import AccuracyEval
+from agno.eval.agent_as_judge import AgentAsJudgeEval
 from agno.eval.performance import PerformanceEval
 from agno.eval.reliability import ReliabilityEval
 from agno.models.base import Model

@@ -36,15 +37,77 @@ async def run_accuracy_eval(
         model=default_model,
     )

-
-        result = await accuracy_eval.arun(print_results=False, print_summary=False)
-    else:
-        result = accuracy_eval.run(print_results=False, print_summary=False)
+    result = await accuracy_eval.arun(print_results=False, print_summary=False)
     if not result:
         raise HTTPException(status_code=500, detail="Failed to run accuracy evaluation")

     eval_run = EvalSchema.from_accuracy_eval(accuracy_eval=accuracy_eval, result=result)

+    # Restore original model after eval
+    if default_model is not None:
+        if agent is not None:
+            agent.model = default_model
+        elif team is not None:
+            team.model = default_model
+
+    return eval_run
+
+
+async def run_agent_as_judge_eval(
+    eval_run_input: EvalRunInput,
+    db: Union[BaseDb, AsyncBaseDb],
+    agent: Optional[Agent] = None,
+    team: Optional[Team] = None,
+    default_model: Optional[Model] = None,
+) -> EvalSchema:
+    """Run an AgentAsJudge evaluation for the given agent or team"""
+    if not eval_run_input.criteria:
+        raise HTTPException(status_code=400, detail="criteria is required for agent-as-judge evaluation")
+
+    # Run agent/team to get output
+    if agent:
+        agent_response = await agent.arun(eval_run_input.input, stream=False)
+        output = str(agent_response.content) if agent_response.content else ""
+        model_id = agent.model.id if agent and agent.model else None
+        model_provider = agent.model.provider if agent and agent.model else None
+        agent_id = agent.id
+        team_id = None
+    elif team:
+        team_response = await team.arun(eval_run_input.input, stream=False)
+        output = str(team_response.content) if team_response.content else ""
+        model_id = team.model.id if team and team.model else None
+        model_provider = team.model.provider if team and team.model else None
+        agent_id = None
+        team_id = team.id
+    else:
+        raise HTTPException(status_code=400, detail="Either agent_id or team_id must be provided")
+
+    agent_as_judge_eval = AgentAsJudgeEval(
+        db=db,
+        criteria=eval_run_input.criteria,
+        scoring_strategy=eval_run_input.scoring_strategy or "binary",
+        threshold=eval_run_input.threshold or 7,
+        additional_guidelines=eval_run_input.additional_guidelines,
+        name=eval_run_input.name,
+        model=default_model,
+    )
+
+    result = await agent_as_judge_eval.arun(
+        input=eval_run_input.input, output=output, print_results=False, print_summary=False
+    )
+    if not result:
+        raise HTTPException(status_code=500, detail="Failed to run agent as judge evaluation")
+
+    eval_run = EvalSchema.from_agent_as_judge_eval(
+        agent_as_judge_eval=agent_as_judge_eval,
+        result=result,
+        agent_id=agent_id,
+        team_id=team_id,
+        model_id=model_id,
+        model_provider=model_provider,
+    )
+
+    # Restore original model after eval
     if default_model is not None:
         if agent is not None:
             agent.model = default_model

@@ -64,16 +127,16 @@ async def run_performance_eval(
     """Run a performance evaluation for the given agent or team"""
     if agent:

-        def run_component():  # type: ignore
-            return agent.
+        async def run_component():  # type: ignore
+            return await agent.arun(eval_run_input.input, stream=False)

         model_id = agent.model.id if agent and agent.model else None
         model_provider = agent.model.provider if agent and agent.model else None

     elif team:

-        def run_component():
-            return team.
+        async def run_component():  # type: ignore
+            return await team.arun(eval_run_input.input, stream=False)

         model_id = team.model.id if team and team.model else None
         model_provider = team.model.provider if team and team.model else None

@@ -90,10 +153,7 @@ async def run_performance_eval(
         model_provider=model_provider,
     )

-
-        result = await performance_eval.arun(print_results=False, print_summary=False)
-    else:
-        result = performance_eval.run(print_results=False, print_summary=False)
+    result = await performance_eval.arun(print_results=False, print_summary=False)
     if not result:
         raise HTTPException(status_code=500, detail="Failed to run performance evaluation")

@@ -106,6 +166,7 @@ async def run_performance_eval(
         model_provider=model_provider,
     )

+    # Restore original model after eval
     if default_model is not None:
         if agent is not None:
             agent.model = default_model

@@ -127,7 +188,7 @@ async def run_reliability_eval(
         raise HTTPException(status_code=400, detail="expected_tool_calls is required for reliability evaluations")

     if agent:
-        agent_response = agent.
+        agent_response = await agent.arun(eval_run_input.input, stream=False)
         reliability_eval = ReliabilityEval(
             db=db,
             name=eval_run_input.name,

@@ -138,7 +199,7 @@ async def run_reliability_eval(
         model_provider = agent.model.provider if agent and agent.model else None

     elif team:
-        team_response = team.
+        team_response = await team.arun(eval_run_input.input, stream=False)
         reliability_eval = ReliabilityEval(
             db=db,
             name=eval_run_input.name,

@@ -148,10 +209,7 @@ async def run_reliability_eval(
         model_id = team.model.id if team and team.model else None
         model_provider = team.model.provider if team and team.model else None

-
-        result = await reliability_eval.arun(print_results=False)
-    else:
-        result = reliability_eval.run(print_results=False)
+    result = await reliability_eval.arun(print_results=False)
     if not result:
         raise HTTPException(status_code=500, detail="Failed to run reliability evaluation")

@@ -163,6 +221,7 @@ async def run_reliability_eval(
         model_provider=model_provider,
     )

+    # Restore original model after eval
     if default_model is not None:
         if agent is not None:
             agent.model = default_model
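
Usage note (not part of the diff): a minimal sketch of driving AgentAsJudgeEval directly, mirroring the new run_agent_as_judge_eval helper above. The constructor and arun arguments are taken from this diff; the Agent setup and leaving db/model unset are assumptions.

import asyncio

from agno.agent import Agent
from agno.eval.agent_as_judge import AgentAsJudgeEval


async def main() -> None:
    agent = Agent(name="support-agent")  # hypothetical agent under evaluation
    run = await agent.arun("How do I reset my password?", stream=False)
    output = str(run.content) if run.content else ""

    evaluation = AgentAsJudgeEval(
        name="helpfulness-check",
        criteria="The answer must give concrete reset steps and stay on topic.",
        scoring_strategy="binary",  # PASS/FAIL; "numeric" scores 1-10 against a threshold
        threshold=7,
    )
    result = await evaluation.arun(
        input="How do I reset my password?", output=output, print_results=False, print_summary=False
    )
    print(result)


if __name__ == "__main__":
    asyncio.run(main())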
agno/os/routers/knowledge/knowledge.py
CHANGED

@@ -981,7 +981,7 @@ async def process_content(
         log_debug(f"Set chunking strategy: {chunker}")

         log_debug(f"Using reader: {content.reader.__class__.__name__}")
-        await knowledge.
+        await knowledge._load_content_async(content, upsert=False, skip_if_exists=True)
         log_info(f"Content {content.id} processed successfully")
     except Exception as e:
         log_info(f"Error processing content: {e}")